Source code for easytransfer.app_zoo.text_classify

# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow as tf
from easytransfer import preprocessors, model_zoo
from easytransfer.app_zoo.base import ApplicationModel
from easytransfer.evaluators import classification_eval_metrics, multi_label_eval_metrics, regression_eval_metrics
import easytransfer.layers as layers
from easytransfer.losses import mean_square_error, multi_label_sigmoid_cross_entropy, softmax_cross_entropy
from easytransfer.preprocessors.deeptext_preprocessor import DeepTextPreprocessor
from easytransfer.model_zoo.modeling_dgcnn import DGCNNPreTrainedModel


class BaseTextClassify(ApplicationModel):
    def __init__(self, **kwargs):
        """ Basic Text Classification Model """
        super(BaseTextClassify, self).__init__(**kwargs)

    @staticmethod
    def default_model_params():
        """ The default values of the Text Classification Model """
        raise NotImplementedError

    def build_logits(self, features, mode=None):
        """ Building graph of the Text Classification Model """
        raise NotImplementedError

    def build_loss(self, logits, labels):
        """ Building loss for training the Text Classification Model """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_sigmoid_cross_entropy(labels, self.config.num_labels, logits)
        elif self.config.num_labels == 1:
            return mean_square_error(labels, logits)
        else:
            return softmax_cross_entropy(labels, self.config.num_labels, logits)

    def build_eval_metrics(self, logits, labels):
        """ Building evaluation metrics while evaluating

        Args:
            logits (`Tensor`): shape of [None, num_labels]
            labels (`Tensor`): shape of [None]

        Returns:
            ret_dict (`dict`): A dict with (`py_accuracy`, `py_micro_f1`, `py_macro_f1`) tf.metrics ops
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_eval_metrics(logits, labels, self.config.num_labels)
        elif self.config.num_labels == 1:
            return regression_eval_metrics(logits, labels)
        else:
            return classification_eval_metrics(logits, labels, self.config.num_labels)

    def build_predictions(self, predict_output):
        """ Building prediction dict of the Text Classification Model

        Args:
            predict_output (`tuple`): (logits, _)

        Returns:
            ret_dict (`dict`): A dict with (`predictions`, `probabilities`, `logits`)
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return self._build_multi_label_predictions(predict_output)
        else:
            return self._build_single_label_predictions(predict_output)

    def _build_single_label_predictions(self, predict_output):
        logits, _ = predict_output
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probs = tf.nn.softmax(logits, axis=1)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict

    def _build_multi_label_predictions(self, predict_output):
        logits, _ = predict_output
        probs = tf.sigmoid(logits)
        predictions = tf.cast(probs > 0.5, tf.int32)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict
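
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): what the
# two prediction branches above compute on toy logits. Runnable under TF 1.x;
# the helper name `_demo_prediction_branches` is hypothetical.
# ---------------------------------------------------------------------------
def _demo_prediction_branches():
    logits = tf.constant([[2.0, -1.0, 0.5]])  # shape [batch=1, num_labels=3]
    # Single-label branch: softmax probabilities, argmax prediction.
    single_probs = tf.nn.softmax(logits, axis=1)
    single_pred = tf.argmax(logits, axis=-1, output_type=tf.int32)  # -> [0]
    # Multi-label branch: independent sigmoids thresholded at 0.5.
    multi_probs = tf.sigmoid(logits)
    multi_pred = tf.cast(multi_probs > 0.5, tf.int32)  # -> [[1, 0, 1]]
    with tf.Session() as sess:
        return sess.run([single_pred, multi_pred, single_probs, multi_probs])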

class BertTextClassify(BaseTextClassify):
    """ BERT Text Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(BertTextClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the BERT Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        _, pool_output = bert_backbone([input_ids, input_mask, segment_ids], mode=mode)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        pool_output = tf.layers.dropout(
            pool_output, rate=self.config.dropout_rate, training=is_training)
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='app/ez_dense')(pool_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels
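
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): the
# tensor schema strings above are comma-separated "name:dtype:length"
# triples. A minimal parser under that assumption; `_parse_tensor_schema`
# is a hypothetical helper.
# ---------------------------------------------------------------------------
def _parse_tensor_schema(schema_str):
    """Parse e.g. "input_ids:int:64,label_id:int:1" into (name, dtype, length) triples."""
    parsed = []
    for field in schema_str.split(","):
        name, dtype, length = field.split(":")
        parsed.append((name, dtype, int(length)))
    return parsed

# _parse_tensor_schema(BertTextClassify.get_input_tensor_schema())
# -> [('input_ids', 'int', 64), ('input_mask', 'int', 64),
#     ('segment_ids', 'int', 64), ('label_id', 'int', 1)]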

class TextCNNClassify(BaseTextClassify):
    """ TextCNN Text Classification Model """
    def __init__(self, **kwargs):
        super(TextCNNClassify, self).__init__(**kwargs)
        self.pre_build_vocab = self.config.mode.startswith("train")

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "max_vocab_size": 30000,
            "embedding_size": 300,
            "num_filters": "100,100,100",
            "filter_sizes": "3,4,5",
            "dropout_rate": 0.5,
            "pretrain_word_embedding_name_or_path": "",
            "fix_embedding": False
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the TextCNN Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
        text_indices, text_masks, _, _, label_ids = text_preprocessor(features)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        word_embeddings = self._add_word_embeddings(
            vocab_size=text_preprocessor.vocab.size,
            embed_size=self.config.embedding_size,
            pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
            trainable=not self.config.fix_embedding)
        text_embeds = tf.nn.embedding_lookup(word_embeddings, text_indices)
        output_features = layers.TextCNNEncoder(
            num_filters=self.config.num_filters,
            filter_sizes=self.config.filter_sizes,
            embed_size=self.config.embedding_size,
            max_seq_len=self.config.sequence_length,
        )([text_embeds, text_masks], training=is_training)
        output_features = tf.layers.dropout(
            output_features, rate=self.config.dropout_rate,
            training=is_training, name='output_features')
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='output_layer')(output_features)
        self.check_and_init_from_checkpoint(mode)
        return logits, label_ids

    def _add_word_embeddings(self, vocab_size, embed_size,
                             pretrained_word_embeddings=None, trainable=False):
        with tf.name_scope("input_representations"):
            if pretrained_word_embeddings is not None:
                tf.logging.info("Initialize word embedding from pretrained")
                word_embedding_initializer = tf.constant_initializer(pretrained_word_embeddings)
            else:
                word_embedding_initializer = layers.get_initializer(0.02)
            word_embeddings = tf.get_variable("word_embeddings",
                                              [vocab_size, embed_size],
                                              dtype=tf.float32,
                                              initializer=word_embedding_initializer,
                                              trainable=trainable)
        return word_embeddings
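
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): how
# `_add_word_embeddings` above wires a pretrained matrix into a frozen
# embedding table. Runnable under TF 1.x; all names here are hypothetical.
# ---------------------------------------------------------------------------
def _demo_pretrained_embedding_lookup():
    import numpy as np
    pretrained = np.random.rand(100, 300).astype("float32")  # [vocab_size, embed_size]
    with tf.variable_scope("demo", reuse=tf.AUTO_REUSE):
        table = tf.get_variable("word_embeddings",
                                shape=pretrained.shape,
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(pretrained),
                                trainable=False)  # trainable=False mirrors fix_embedding=True
    token_ids = tf.constant([[1, 5, 7]])  # [batch=1, seq_len=3]
    return tf.nn.embedding_lookup(table, token_ids)  # -> shape [1, 3, 300]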

class TextDGCNNClassify(BaseTextClassify):
    """ Text DGCNN Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(TextDGCNNClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the Text DGCNN Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        dgcnn_backbone = DGCNNPreTrainedModel(self.config.pretrain_model_name_or_path)
        _, pooled_output = dgcnn_backbone(input_ids, mode=mode)
        dense = layers.Dense(self.config.num_labels,
                             kernel_initializer=layers.get_initializer(0.02),
                             name='dense')
        logits = dense(pooled_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels
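
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): one
# common formulation of the gated dilated convolution block behind
# DGCNN-style encoders (residual gate mixing). The library's actual
# implementation lives in `modeling_dgcnn`; this is only an approximation.
# ---------------------------------------------------------------------------
def _demo_gated_dilated_conv(x, dilation_rate=2):
    # x: [batch, seq_len, hidden]
    hidden = x.shape[-1].value
    h = tf.layers.conv1d(x, filters=hidden, kernel_size=3,
                         padding="same", dilation_rate=dilation_rate)
    g = tf.layers.conv1d(x, filters=hidden, kernel_size=3,
                         padding="same", dilation_rate=dilation_rate,
                         activation=tf.sigmoid)
    return x * (1.0 - g) + h * g  # gate g interpolates between input and conv output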

class ExtentableMultiTaskClassify(BaseTextClassify):
    def __init__(self, **kwargs):
        super(ExtentableMultiTaskClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64," \
               "label_input_ids:int:64,label_input_mask:int:64,label_segment_ids:int:64," \
               "task_ids:int:1,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64," \
               "task_ids:int:1"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "freeze_encoder": False,
            "max_task_num": 100,
            "max_label_num": 30,
            "task_column_name": "task",
            "reset_task_ids": ""
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      user_defined_config=self.config,
                                                      app_model_name="text_classify_bert_emtl")
        self.max_task_num = preprocessor.max_task_num
        self.max_label_num = preprocessor.max_label_num
        self.config.num_labels = self.max_label_num
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        input_ids, input_mask, segment_ids, _, _, _, task_idx, label_ids = preprocessor(features)
        if len(task_idx.shape) > 1:
            task_idx = tf.squeeze(task_idx, axis=1)
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # [batch_size, hidden_size]
        self.sequence_output, pool_output = bert_backbone(
            [input_ids, input_mask, segment_ids], mode=mode)
        multi_task_kernel = tf.get_variable(
            name="output/multi_task_kernel",
            shape=[self.max_task_num, self.max_label_num, bert_backbone.config.hidden_size],
            initializer=layers.get_initializer(0.02))
        multi_task_bias = tf.get_variable(
            name="output/multi_task_bias",
            shape=[self.max_task_num, self.max_label_num],
            initializer=tf.initializers.zeros)
        self.selective_init_from_checkpoint_without_training_ops(mode)
        # [batch_size, max_label_size, hidden_size]
        batch_kernel = tf.gather(multi_task_kernel, task_idx)
        # [batch_size, max_label_size]
        batch_bias = tf.gather(multi_task_bias, task_idx)
        # [batch_size, 1, hidden_size] x [batch_size, hidden_size, max_label_size]
        # = [batch_size, 1, max_label_size]
        logits = tf.matmul(tf.expand_dims(pool_output, axis=1),
                           tf.transpose(batch_kernel, [0, 2, 1]))
        logits = tf.squeeze(logits, axis=1) + batch_bias
        if hasattr(self.config, "freeze_encoder") and self.config.freeze_encoder:
            self.tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "output/")
        return logits, label_ids

    def selective_init_from_checkpoint_without_training_ops(self, mode):
        init_from_ckpt = hasattr(self.config, "init_checkpoint_path") and self.config.init_checkpoint_path \
                         and mode == tf.estimator.ModeKeys.TRAIN
        if not init_from_ckpt:
            return
        import re
        from tensorflow.python import pywrap_tensorflow
        from tensorflow.python.framework import errors_impl
        pretrained_model_path = self.config.init_checkpoint_path
        if self.config.reset_task_ids:
            reset_task_ids = [[int(t)] for t in self.config.reset_task_ids.split(",")]
        else:
            reset_task_ids = None
        tvars = tf.trainable_variables()
        network_name_to_variable = {}
        for var in tvars:
            name = var.name
            m = re.match("^(.*):\\d+$", name)
            if m is not None:
                name = m.group(1)
            network_name_to_variable[name] = var
        try:
            reader = pywrap_tensorflow.NewCheckpointReader(pretrained_model_path)
            var_to_shape_map = reader.get_variable_to_shape_map()
        except errors_impl.DataLossError:
            raise ImportError('`load_weights` requires correct tf ckpts.')
        assignment_map = {}
        for key in var_to_shape_map:
            if "Adam" in key or "beta1_power" in key or "beta2_power" in key:
                continue
            if "global_step" in key:
                continue
            var = None
            if "pre_trained_model" in key:
                root_key = key.replace(key.split("/")[0] + "/", "")
            else:
                root_key = key
            for network_key in network_name_to_variable.keys():
                if root_key in network_key:
                    var = network_name_to_variable[network_key]
                    break
            if var is None:
                print("Variable: {} in ckpt not in trainable variable".format(key))
                continue
                # raise ValueError("ckpt var name {} not in trainable variable".format(key))
            if "multi_task_kernel" in key and reset_task_ids:
                # TODO: reset kernels of task in reset_task_ids
                pass
            if "multi_task_bias" in key and reset_task_ids:
                # TODO: reset bias of task in reset_task_ids
                pass
            assignment_map[key] = var
        tf.logging.info("Load weights from {}".format(pretrained_model_path))
        tf.train.init_from_checkpoint(pretrained_model_path, assignment_map)
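
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): the
# per-task logits computation in `ExtentableMultiTaskClassify.build_logits`,
# on toy shapes, plus an equivalent einsum form. Runnable under TF 1.x.
# ---------------------------------------------------------------------------
def _demo_per_task_logits():
    batch, hidden, max_tasks, max_labels = 2, 4, 3, 5
    pool_output = tf.random_normal([batch, hidden])
    kernel = tf.random_normal([max_tasks, max_labels, hidden])
    bias = tf.zeros([max_tasks, max_labels])
    task_idx = tf.constant([0, 2])  # each example selects its own task's head
    batch_kernel = tf.gather(kernel, task_idx)  # [batch, max_labels, hidden]
    batch_bias = tf.gather(bias, task_idx)      # [batch, max_labels]
    # matmul form used above: [batch, 1, hidden] x [batch, hidden, max_labels]
    logits = tf.squeeze(tf.matmul(tf.expand_dims(pool_output, axis=1),
                                  tf.transpose(batch_kernel, [0, 2, 1])), axis=1) + batch_bias
    # equivalent einsum form
    logits_einsum = tf.einsum("bh,blh->bl", pool_output, batch_kernel) + batch_bias
    return logits, logits_einsum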