Source code for easytransfer.app_zoo.text_classify

# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow as tf
from easytransfer import preprocessors, model_zoo
from easytransfer.app_zoo.base import ApplicationModel
from easytransfer.evaluators import classification_eval_metrics, multi_label_eval_metrics, regression_eval_metrics
import easytransfer.layers as layers
from easytransfer.losses import mean_square_error, multi_label_sigmoid_cross_entropy, softmax_cross_entropy
from easytransfer.preprocessors.deeptext_preprocessor import DeepTextPreprocessor
from easytransfer.model_zoo.modeling_dgcnn import DGCNNPreTrainedModel


class BaseTextClassify(ApplicationModel):
    def __init__(self, **kwargs):
        """ Basic Text Classification Model """
        super(BaseTextClassify, self).__init__(**kwargs)

    @staticmethod
    def default_model_params():
        """ The default values of the Text Classification Model """
        raise NotImplementedError

    def build_logits(self, features, mode=None):
        """ Building graph of the Text Classification Model """
        raise NotImplementedError

    def build_loss(self, logits, labels):
        """ Building loss for training the Text Classification Model """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_sigmoid_cross_entropy(labels, self.config.num_labels, logits)
        elif self.config.num_labels == 1:
            return mean_square_error(labels, logits)
        else:
            return softmax_cross_entropy(labels, self.config.num_labels, logits)

    def build_eval_metrics(self, logits, labels):
        """ Building evaluation metrics while evaluating

        Args:
            logits (`Tensor`): shape of [None, num_labels]
            labels (`Tensor`): shape of [None]

        Returns:
            ret_dict (`dict`): A dict with (`py_accuracy`, `py_micro_f1`, `py_macro_f1`) tf.metrics ops
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_eval_metrics(logits, labels, self.config.num_labels)
        elif self.config.num_labels == 1:
            return regression_eval_metrics(logits, labels)
        else:
            return classification_eval_metrics(logits, labels, self.config.num_labels)

    def build_predictions(self, predict_output):
        """ Building prediction dict of the Text Classification Model

        Args:
            predict_output (`tuple`): (logits, _)

        Returns:
            ret_dict (`dict`): A dict with (`predictions`, `probabilities`, `logits`)
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return self._build_multi_label_predictions(predict_output)
        else:
            return self._build_single_label_predictions(predict_output)

    def _build_single_label_predictions(self, predict_output):
        logits, _ = predict_output
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probs = tf.nn.softmax(logits, axis=1)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict

    def _build_multi_label_predictions(self, predict_output):
        logits, _ = predict_output
        probs = tf.sigmoid(logits)
        predictions = tf.cast(probs > 0.5, tf.int32)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict
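
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): what the
# two prediction branches above compute on toy logits. Runnable under TF 1.x;
# the helper name `_demo_prediction_branches` is hypothetical.
# ---------------------------------------------------------------------------
def _demo_prediction_branches():
    logits = tf.constant([[2.0, -1.0, 0.5]])  # shape [batch=1, num_labels=3]
    # Single-label branch: softmax probabilities, argmax prediction.
    single_probs = tf.nn.softmax(logits, axis=1)
    single_pred = tf.argmax(logits, axis=-1, output_type=tf.int32)  # -> [0]
    # Multi-label branch: independent sigmoids thresholded at 0.5.
    multi_probs = tf.sigmoid(logits)
    multi_pred = tf.cast(multi_probs > 0.5, tf.int32)  # -> [[1, 0, 1]]
    with tf.Session() as sess:
        return sess.run([single_pred, multi_pred, single_probs, multi_probs])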

class BertTextClassify(BaseTextClassify):
    """ BERT Text Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(BertTextClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the BERT Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        _, pool_output = bert_backbone([input_ids, input_mask, segment_ids], mode=mode)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        pool_output = tf.layers.dropout(
            pool_output, rate=self.config.dropout_rate, training=is_training)
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='app/ez_dense')(pool_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels
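
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): the
# tensor schema strings above are comma-separated "name:dtype:length"
# triples. A minimal parser under that assumption; `_parse_tensor_schema`
# is a hypothetical helper.
# ---------------------------------------------------------------------------
def _parse_tensor_schema(schema_str):
    """Parse e.g. "input_ids:int:64,label_id:int:1" into (name, dtype, length) triples."""
    parsed = []
    for field in schema_str.split(","):
        name, dtype, length = field.split(":")
        parsed.append((name, dtype, int(length)))
    return parsed

# _parse_tensor_schema(BertTextClassify.get_input_tensor_schema())
# -> [('input_ids', 'int', 64), ('input_mask', 'int', 64),
#     ('segment_ids', 'int', 64), ('label_id', 'int', 1)]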

class TextCNNClassify(BaseTextClassify):
    """ TextCNN Text Classification Model """
    def __init__(self, **kwargs):
        super(TextCNNClassify, self).__init__(**kwargs)
        self.pre_build_vocab = self.config.mode.startswith("train")

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "max_vocab_size": 30000,
            "embedding_size": 300,
            "num_filters": "100,100,100",
            "filter_sizes": "3,4,5",
            "dropout_rate": 0.5,
            "pretrain_word_embedding_name_or_path": "",
            "fix_embedding": False
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the TextCNN Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
        text_indices, text_masks, _, _, label_ids = text_preprocessor(features)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        word_embeddings = self._add_word_embeddings(
            vocab_size=text_preprocessor.vocab.size,
            embed_size=self.config.embedding_size,
            pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
            trainable=not self.config.fix_embedding)
        text_embeds = tf.nn.embedding_lookup(word_embeddings, text_indices)
        output_features = layers.TextCNNEncoder(
            num_filters=self.config.num_filters,
            filter_sizes=self.config.filter_sizes,
            embed_size=self.config.embedding_size,
            max_seq_len=self.config.sequence_length,
        )([text_embeds, text_masks], training=is_training)
        output_features = tf.layers.dropout(
            output_features, rate=self.config.dropout_rate,
            training=is_training, name='output_features')
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='output_layer')(output_features)
        self.check_and_init_from_checkpoint(mode)
        return logits, label_ids

    def _add_word_embeddings(self, vocab_size, embed_size,
                             pretrained_word_embeddings=None, trainable=False):
        with tf.name_scope("input_representations"):
            if pretrained_word_embeddings is not None:
                tf.logging.info("Initialize word embedding from pretrained")
                word_embedding_initializer = tf.constant_initializer(pretrained_word_embeddings)
            else:
                word_embedding_initializer = layers.get_initializer(0.02)
            word_embeddings = tf.get_variable("word_embeddings",
                                              [vocab_size, embed_size],
                                              dtype=tf.float32,
                                              initializer=word_embedding_initializer,
                                              trainable=trainable)
        return word_embeddings
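
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): how
# `_add_word_embeddings` above wires a pretrained matrix into a frozen
# embedding table. Runnable under TF 1.x; all names here are hypothetical.
# ---------------------------------------------------------------------------
def _demo_pretrained_embedding_lookup():
    import numpy as np
    pretrained = np.random.rand(100, 300).astype("float32")  # [vocab_size, embed_size]
    with tf.variable_scope("demo", reuse=tf.AUTO_REUSE):
        table = tf.get_variable("word_embeddings",
                                shape=pretrained.shape,
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(pretrained),
                                trainable=False)  # trainable=False mirrors fix_embedding=True
    token_ids = tf.constant([[1, 5, 7]])  # [batch=1, seq_len=3]
    return tf.nn.embedding_lookup(table, token_ids)  # -> shape [1, 3, 300]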

class TextDGCNNClassify(BaseTextClassify):
    """ Text DGCNN Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(TextDGCNNClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the Text DGCNN Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): tells the model whether it is under training

        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        dgcnn_backbone = DGCNNPreTrainedModel(self.config.pretrain_model_name_or_path)
        _, pooled_output = dgcnn_backbone(input_ids, mode=mode)
        dense = layers.Dense(self.config.num_labels,
                             kernel_initializer=layers.get_initializer(0.02),
                             name='dense')
        logits = dense(pooled_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels
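
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): one
# common formulation of the gated dilated convolution block behind
# DGCNN-style encoders (residual gate mixing). The library's actual
# implementation lives in `modeling_dgcnn`; this is only an approximation.
# ---------------------------------------------------------------------------
def _demo_gated_dilated_conv(x, dilation_rate=2):
    # x: [batch, seq_len, hidden]
    hidden = x.shape[-1].value
    h = tf.layers.conv1d(x, filters=hidden, kernel_size=3,
                         padding="same", dilation_rate=dilation_rate)
    g = tf.layers.conv1d(x, filters=hidden, kernel_size=3,
                         padding="same", dilation_rate=dilation_rate,
                         activation=tf.sigmoid)
    return x * (1.0 - g) + h * g  # gate g interpolates between input and conv output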

class ExtentableMultiTaskClassify(BaseTextClassify):
    def __init__(self, **kwargs):
        super(ExtentableMultiTaskClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64," \
               "label_input_ids:int:64,label_input_mask:int:64,label_segment_ids:int:64," \
               "task_ids:int:1,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64," \
               "task_ids:int:1"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "freeze_encoder": False,
            "max_task_num": 100,
            "max_label_num": 30,
            "task_column_name": "task",
            "reset_task_ids": ""
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      user_defined_config=self.config,
                                                      app_model_name="text_classify_bert_emtl")
        self.max_task_num = preprocessor.max_task_num
        self.max_label_num = preprocessor.max_label_num
        self.config.num_labels = self.max_label_num
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        input_ids, input_mask, segment_ids, _, _, _, task_idx, label_ids = preprocessor(features)
        if len(task_idx.shape) > 1:
            task_idx = tf.squeeze(task_idx, axis=1)
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # [batch_size, hidden_size]
        self.sequence_output, pool_output = bert_backbone(
            [input_ids, input_mask, segment_ids], mode=mode)
        multi_task_kernel = tf.get_variable(
            name="output/multi_task_kernel",
            shape=[self.max_task_num, self.max_label_num, bert_backbone.config.hidden_size],
            initializer=layers.get_initializer(0.02))
        multi_task_bias = tf.get_variable(
            name="output/multi_task_bias",
            shape=[self.max_task_num, self.max_label_num],
            initializer=tf.initializers.zeros)
        self.selective_init_from_checkpoint_without_training_ops(mode)
        # [batch_size, max_label_size, hidden_size]
        batch_kernel = tf.gather(multi_task_kernel, task_idx)
        # [batch_size, max_label_size]
        batch_bias = tf.gather(multi_task_bias, task_idx)
        # [batch_size, 1, hidden_size] x [batch_size, hidden_size, max_label_size]
        # = [batch_size, 1, max_label_size]
        logits = tf.matmul(tf.expand_dims(pool_output, axis=1),
                           tf.transpose(batch_kernel, [0, 2, 1]))
        logits = tf.squeeze(logits, axis=1) + batch_bias
        if hasattr(self.config, "freeze_encoder") and self.config.freeze_encoder:
            self.tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "output/")
        return logits, label_ids

    def selective_init_from_checkpoint_without_training_ops(self, mode):
        init_from_ckpt = hasattr(self.config, "init_checkpoint_path") and self.config.init_checkpoint_path \
                         and mode == tf.estimator.ModeKeys.TRAIN
        if not init_from_ckpt:
            return
        import re
        from tensorflow.python import pywrap_tensorflow
        from tensorflow.python.framework import errors_impl
        pretrained_model_path = self.config.init_checkpoint_path
        if self.config.reset_task_ids:
            reset_task_ids = [[int(t)] for t in self.config.reset_task_ids.split(",")]
        else:
            reset_task_ids = None
        tvars = tf.trainable_variables()
        network_name_to_variable = {}
        for var in tvars:
            name = var.name
            m = re.match("^(.*):\\d+$", name)
            if m is not None:
                name = m.group(1)
            network_name_to_variable[name] = var
        try:
            reader = pywrap_tensorflow.NewCheckpointReader(pretrained_model_path)
            var_to_shape_map = reader.get_variable_to_shape_map()
        except errors_impl.DataLossError:
            raise ImportError('`load_weights` requires correct tf ckpts.')
        assignment_map = {}
        for key in var_to_shape_map:
            if "Adam" in key or "beta1_power" in key or "beta2_power" in key:
                continue
            if "global_step" in key:
                continue
            var = None
            if "pre_trained_model" in key:
                root_key = key.replace(key.split("/")[0] + "/", "")
            else:
                root_key = key
            for network_key in network_name_to_variable.keys():
                if root_key in network_key:
                    var = network_name_to_variable[network_key]
                    break
            if var is None:
                print("Variable: {} in ckpt not in trainable variable".format(key))
                continue
                # raise ValueError("ckpt var name {} not in trainable variable".format(key))
            if "multi_task_kernel" in key and reset_task_ids:
                # TODO: reset kernels of task in reset_task_ids
                pass
            if "multi_task_bias" in key and reset_task_ids:
                # TODO: reset bias of task in reset_task_ids
                pass
            assignment_map[key] = var
        tf.logging.info("Load weights from {}".format(pretrained_model_path))
        tf.train.init_from_checkpoint(pretrained_model_path, assignment_map)
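
# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the library): the
# per-task logits computation in `ExtentableMultiTaskClassify.build_logits`,
# on toy shapes, plus an equivalent einsum form. Runnable under TF 1.x.
# ---------------------------------------------------------------------------
def _demo_per_task_logits():
    batch, hidden, max_tasks, max_labels = 2, 4, 3, 5
    pool_output = tf.random_normal([batch, hidden])
    kernel = tf.random_normal([max_tasks, max_labels, hidden])
    bias = tf.zeros([max_tasks, max_labels])
    task_idx = tf.constant([0, 2])  # each example selects its own task's head
    batch_kernel = tf.gather(kernel, task_idx)  # [batch, max_labels, hidden]
    batch_bias = tf.gather(bias, task_idx)      # [batch, max_labels]
    # matmul form used above: [batch, 1, hidden] x [batch, hidden, max_labels]
    logits = tf.squeeze(tf.matmul(tf.expand_dims(pool_output, axis=1),
                                  tf.transpose(batch_kernel, [0, 2, 1])), axis=1) + batch_bias
    # equivalent einsum form
    logits_einsum = tf.einsum("bh,blh->bl", pool_output, batch_kernel) + batch_bias
    return logits, logits_einsum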