Source code for easytransfer.preprocessors.classification_regression_preprocessor

# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from collections import OrderedDict
import json
import numpy as np
import tensorflow as tf
from .preprocessor import Preprocessor, PreprocessorConfig, truncate_seq_pair
from .tokenization import convert_to_unicode


class ClassificationRegressionPreprocessorConfig(PreprocessorConfig):
    def __init__(self, **kwargs):
        super(ClassificationRegressionPreprocessorConfig, self).__init__(**kwargs)
        self.input_schema = kwargs.get("input_schema")
        self.output_schema = kwargs.get("output_schema", None)
        self.sequence_length = kwargs.get("sequence_length")
        self.first_sequence = kwargs.get("first_sequence")
        self.second_sequence = kwargs.get("second_sequence")
        self.label_name = kwargs.get("label_name")
        self.label_enumerate_values = kwargs.get("label_enumerate_values")
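
# Minimal sketch of the kwargs the config above reads, for a sentence-pair
# task. The column names, label values, and sequence length are illustrative
# placeholders, and PreprocessorConfig may expect additional kwargs (e.g.
# tokenizer settings) that are not shown here.
def _example_classification_config_kwargs():
    return {
        "input_schema": "text_a:str:1,text_b:str:1,label:str:1",
        "first_sequence": "text_a",
        "second_sequence": "text_b",
        "label_name": "label",
        "label_enumerate_values": "neg,pos",
        "sequence_length": 128,
    }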


class ClassificationRegressionPreprocessor(Preprocessor):
    """ Preprocessor for classification/regression tasks """

    config_class = ClassificationRegressionPreprocessorConfig

    def __init__(self, config, **kwargs):
        Preprocessor.__init__(self, config, **kwargs)
        self.config = config

        self.input_tensor_names = []
        for schema in config.input_schema.split(","):
            name = schema.split(":")[0]
            self.input_tensor_names.append(name)

        self.label_idx_map = OrderedDict()
        if self.config.label_enumerate_values is not None:
            for (i, label) in enumerate(self.config.label_enumerate_values.split(",")):
                self.label_idx_map[convert_to_unicode(label)] = i

        if hasattr(self.config, "multi_label") and self.config.multi_label is True:
            self.multi_label = True
            self.max_num_labels = self.config.max_num_labels \
                if hasattr(self.config, "max_num_labels") else 5
        else:
            self.multi_label = False
            self.max_num_labels = None

    def set_feature_schema(self):
        if self.mode.startswith("predict") or self.mode == "preprocess":
            self.output_schema = self.config.output_schema
        self.output_tensor_names = ["input_ids", "input_mask", "segment_ids", "label_id"]
        if self.multi_label:
            self.seq_lens = [self.config.sequence_length] * 3 + [self.max_num_labels]
            self.feature_value_types = [tf.int64] * 3 + [tf.int64]
        else:
            self.seq_lens = [self.config.sequence_length] * 3 + [1]
            if len(self.label_idx_map) >= 2:
                self.feature_value_types = [tf.int64] * 4
            else:
                self.feature_value_types = [tf.int64] * 3 + [tf.float32]

    def convert_example_to_features(self, items):
        """ Convert a single example to classification/regression features

        Args:
            items (`dict`): inputs from the reader

        Returns:
            features (`tuple`): (input_ids, input_mask, segment_ids, label_id)
        """
        text_a = items[self.input_tensor_names.index(self.config.first_sequence)]
        tokens_a = self.config.tokenizer.tokenize(convert_to_unicode(text_a))
        if self.config.second_sequence in self.input_tensor_names:
            text_b = items[self.input_tensor_names.index(self.config.second_sequence)]
            tokens_b = self.config.tokenizer.tokenize(convert_to_unicode(text_b))
            # Account for [CLS], [SEP], [SEP] with "- 3"
            truncate_seq_pair(tokens_a, tokens_b, self.config.sequence_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self.config.sequence_length - 2:
                tokens_a = tokens_a[0:(self.config.sequence_length - 2)]
            tokens_b = None

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = self.config.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.sequence_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.sequence_length
        assert len(input_mask) == self.config.sequence_length
        assert len(segment_ids) == self.config.sequence_length

        if self.config.label_name is not None:
            label_value = items[self.input_tensor_names.index(self.config.label_name)]
            if isinstance(label_value, str) or isinstance(label_value, bytes):
                label = convert_to_unicode(label_value)
            else:
                label = str(label_value)
            if self.multi_label:
                label_ids = [self.label_idx_map[convert_to_unicode(x)]
                             for x in label.split(",") if x]
                label_ids = label_ids[:self.max_num_labels]
                label_ids = label_ids + [-1 for _ in range(self.max_num_labels - len(label_ids))]
                label_ids = [str(t) for t in label_ids]
                label_id = ' '.join(label_ids)
            elif len(self.label_idx_map) >= 2:
                label_id = str(self.label_idx_map[convert_to_unicode(label)])
            else:
                label_id = label
        else:
            label_id = '0'

        return ' '.join([str(t) for t in input_ids]), \
               ' '.join([str(t) for t in input_mask]), \
               ' '.join([str(t) for t in segment_ids]), label_id
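

# Standalone sketch of the [CLS]/[SEP] packing and padding performed in
# convert_example_to_features above, using a toy whitespace "tokenizer". The
# real code tokenizes via self.config.tokenizer and pads the integer ids with
# 0; here we pad with a literal "[PAD]" token for readability.
def _example_pack_pair(text_a, text_b, max_len=10):
    tokens_a, tokens_b = text_a.split(), text_b.split()
    # Truncate in place so that [CLS] + a + [SEP] + b + [SEP] fits in max_len.
    truncate_seq_pair(tokens_a, tokens_b, max_len - 3)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)
    pad = max_len - len(tokens)
    return (tokens + ["[PAD]"] * pad,
            input_mask + [0] * pad,
            segment_ids + [0] * pad)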


class PairedClassificationRegressionPreprocessor(ClassificationRegressionPreprocessor):
    """ Preprocessor for paired classification/regression tasks """

    config_class = ClassificationRegressionPreprocessorConfig

    def __init__(self, config, **kwargs):
        super(PairedClassificationRegressionPreprocessor, self).__init__(config, **kwargs)

    def set_feature_schema(self):
        if self.mode.startswith("predict") or self.mode == "preprocess":
            self.output_schema = self.config.output_schema
        self.output_tensor_names = ["input_ids_a", "input_mask_a", "segment_ids_a",
                                    "input_ids_b", "input_mask_b", "segment_ids_b",
                                    "label_id"]
        self.seq_lens = [self.config.sequence_length] * 6 + [1]
        if len(self.label_idx_map) >= 2:
            self.feature_value_types = [tf.int64] * 6 + [tf.int64]
        else:
            self.feature_value_types = [tf.int64] * 6 + [tf.float32]

    def convert_example_to_features(self, items):
        """ Convert a single example to paired classification/regression features

        Args:
            items (`dict`): inputs from the reader

        Returns:
            features (`tuple`): (input_ids_a, input_mask_a, segment_ids_a,
                                 input_ids_b, input_mask_b, segment_ids_b, label_id)
        """
        assert self.config.first_sequence in self.input_tensor_names \
               and self.config.second_sequence in self.input_tensor_names
        text_a = items[self.input_tensor_names.index(self.config.first_sequence)]
        tokens_a = self.config.tokenizer.tokenize(convert_to_unicode(text_a))
        text_b = items[self.input_tensor_names.index(self.config.second_sequence)]
        tokens_b = self.config.tokenizer.tokenize(convert_to_unicode(text_b))

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.config.sequence_length - 2:
            tokens_a = tokens_a[0:(self.config.sequence_length - 2)]
        if len(tokens_b) > self.config.sequence_length - 2:
            tokens_b = tokens_b[0:(self.config.sequence_length - 2)]

        tokens = []
        segment_ids_a = []
        tokens.append("[CLS]")
        segment_ids_a.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids_a.append(0)
        tokens.append("[SEP]")
        segment_ids_a.append(0)
        input_ids_a = self.config.tokenizer.convert_tokens_to_ids(tokens)

        tokens = []
        segment_ids_b = []
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids_b.append(1)
            tokens.append("[SEP]")
            segment_ids_b.append(1)
        input_ids_b = self.config.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask_a = [1] * len(input_ids_a)
        input_mask_b = [1] * len(input_ids_b)

        # Zero-pad up to the sequence length.
        while len(input_ids_a) < self.config.sequence_length:
            input_ids_a.append(0)
            input_mask_a.append(0)
            segment_ids_a.append(0)

        # Zero-pad up to the sequence length.
        while len(input_ids_b) < self.config.sequence_length:
            input_ids_b.append(0)
            input_mask_b.append(0)
            segment_ids_b.append(0)

        assert len(input_ids_a) == self.config.sequence_length
        assert len(input_mask_a) == self.config.sequence_length
        assert len(segment_ids_a) == self.config.sequence_length
        assert len(input_ids_b) == self.config.sequence_length
        assert len(input_mask_b) == self.config.sequence_length
        assert len(segment_ids_b) == self.config.sequence_length

        # Support single/multi-class classification and regression
        if self.config.label_name is not None:
            label_value = items[self.input_tensor_names.index(self.config.label_name)]
            if isinstance(label_value, str) or isinstance(label_value, bytes):
                label = convert_to_unicode(label_value)
            else:
                label = str(label_value)
            if len(self.label_idx_map) >= 2:
                label_id = str(self.label_idx_map[convert_to_unicode(label)])
            else:
                label_id = label
        else:
            label_id = '0'

        return ' '.join([str(t) for t in input_ids_a]), \
               ' '.join([str(t) for t in input_mask_a]), \
               ' '.join([str(t) for t in segment_ids_a]), \
               ' '.join([str(t) for t in input_ids_b]), \
               ' '.join([str(t) for t in input_mask_b]), \
               ' '.join([str(t) for t in segment_ids_b]), label_id
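

# Sketch of the two-tower layout produced above, with toy whitespace
# tokenization. Unlike the single-tower case, the two texts are encoded
# independently: sequence A gets "[CLS] ... [SEP]" with segment id 0, and
# sequence B gets "... [SEP]" (no [CLS]) with segment id 1. The real code
# then zero-pads both sides to sequence_length.
def _example_pack_paired(text_a, text_b):
    tokens_a = ["[CLS]"] + text_a.split() + ["[SEP]"]
    segment_ids_a = [0] * len(tokens_a)
    tokens_b = text_b.split() + ["[SEP]"]  # note: no [CLS] on the second tower
    segment_ids_b = [1] * len(tokens_b)
    return (tokens_a, segment_ids_a), (tokens_b, segment_ids_b)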


class MultiTaskClassificationPreprocessorConfig(PreprocessorConfig):
    def __init__(self, **kwargs):
        super(MultiTaskClassificationPreprocessorConfig, self).__init__(**kwargs)
        self.input_schema = kwargs.get("input_schema")
        self.output_schema = kwargs.get("output_schema", None)
        self.first_sequence = kwargs.get("first_sequence")
        self.second_sequence = kwargs.get("second_sequence")
        self.label_name = kwargs.get("label_name")
        self.label_meta_info_path = kwargs.get("label_enumerate_values")
        self.task = kwargs.get("task_column_name")
        self.sequence_length = kwargs.get("sequence_length")
        self.max_label_length = kwargs.get("max_label_length", 10)
        self.max_task_num = kwargs.get("max_task_num", None)
        self.max_label_num = kwargs.get("max_label_num", None)
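

# Illustrative shape of the label-meta JSON file consumed by
# MultiTaskClassificationPreprocessor below. The field names are taken from
# the parsing code; the task keys and labels themselves are made up.
_EXAMPLE_LABEL_META_INFO = [
    {"taskKey": "sentiment", "taskIndex": 0, "labelMap": ["neg", "pos"]},
    {"taskKey": "topic", "taskIndex": 1, "labelMap": ["sports", "tech", "finance"]},
]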


class MultiTaskClassificationPreprocessor(Preprocessor):
    """ Preprocessor for multi-task classification """

    config_class = MultiTaskClassificationPreprocessorConfig

    def __init__(self, config, **kwargs):
        Preprocessor.__init__(self, config, **kwargs)
        self.config = config

        self.input_tensor_names = []
        for schema in config.input_schema.split(","):
            name = schema.split(":")[0]
            self.input_tensor_names.append(name)

        with tf.gfile.Open(config.label_meta_info_path.strip("^")) as f:
            label_meta_info = json.load(f)

        self.task_id_to_idx = dict()
        self.task_id_to_label_mapping = dict()
        self.task_id_to_label_features = dict()
        self.max_task_num = self.config.max_task_num \
            if self.config.max_task_num else len(label_meta_info)
        self.max_label_num = self.config.max_label_num \
            if self.config.max_label_num else max([len(t["labelMap"]) for t in label_meta_info])
        self.max_label_length = self.config.max_label_length

        for task_label_info in label_meta_info:
            task_idx = task_label_info["taskIndex"]
            labels = task_label_info["labelMap"]
            label_map = {label: idx for idx, label in enumerate(labels)}
            task_key = task_label_info["taskKey"]
            self.task_id_to_idx[task_key] = task_idx
            self.task_id_to_label_mapping[task_key] = label_map

            # Encode all candidate labels of this task into one flat sequence:
            # [CLS] label_1 [SEP] [PAD]... label_2 [SEP] [PAD]... so each label
            # occupies a fixed-width slot of (max_label_length + 1) tokens.
            label_tokens = ['[CLS]']
            label_segment_ids = [0]
            label_input_mask = [1]
            for label in labels:
                sub_tokens = self.config.tokenizer.tokenize(convert_to_unicode(label))
                tmp_label_tokens = list()
                tmp_segment_ids = list()
                tmp_input_mask = list()
                for sub_tok in sub_tokens[:self.config.max_label_length]:
                    tmp_label_tokens.append(sub_tok)
                    tmp_segment_ids.append(0)
                    tmp_input_mask.append(1)
                tmp_label_tokens.append('[SEP]')
                tmp_segment_ids.append(0)
                tmp_input_mask.append(1)
                while len(tmp_label_tokens) < self.config.max_label_length + 1:
                    tmp_label_tokens.append('[PAD]')
                    tmp_segment_ids.append(0)
                    tmp_input_mask.append(0)
                label_tokens.extend(tmp_label_tokens)
                label_segment_ids.extend(tmp_segment_ids)
                label_input_mask.extend(tmp_input_mask)

            label_input_ids = self.config.tokenizer.convert_tokens_to_ids(label_tokens)
            while len(label_input_ids) < 1 + self.max_label_num * (self.config.max_label_length + 1):
                label_input_ids.append(0)
                label_input_mask.append(0)
                label_segment_ids.append(0)
            self.task_id_to_label_features[task_key] = \
                (label_input_ids, label_input_mask, label_segment_ids)

    def set_feature_schema(self):
        if self.mode.startswith("predict") or self.mode == "preprocess":
            self.output_schema = self.config.output_schema
        self.output_tensor_names = ["input_ids", "input_mask", "segment_ids",
                                    "label_input_ids", "label_input_mask", "label_segment_ids",
                                    "task_ids", "label_ids"]
        self.seq_lens = [self.config.sequence_length] * 3 + \
                        [1 + self.max_label_num * (self.config.max_label_length + 1)] * 3 + \
                        [1, 1]
        self.feature_value_types = [tf.int64] * 8

    def convert_example_to_features(self, items):
        """ Convert a single example to multi-task classification features

        Args:
            items (`dict`): inputs from the reader

        Returns:
            features (`tuple`): (input_ids, input_mask, segment_ids,
                                 label_input_ids, label_input_mask, label_segment_ids,
                                 task_idx, label_id)
        """
        text_a = items[self.input_tensor_names.index(self.config.first_sequence)]
        tokens_a = self.config.tokenizer.tokenize(convert_to_unicode(text_a))
        if self.config.second_sequence in self.input_tensor_names:
            text_b = items[self.input_tensor_names.index(self.config.second_sequence)]
            tokens_b = self.config.tokenizer.tokenize(convert_to_unicode(text_b))
            # Account for [CLS], [SEP], [SEP] with "- 3"
            truncate_seq_pair(tokens_a, tokens_b, self.config.sequence_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self.config.sequence_length - 2:
                tokens_a = tokens_a[0:(self.config.sequence_length - 2)]
            tokens_b = None

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = self.config.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.sequence_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.sequence_length
        assert len(input_mask) == self.config.sequence_length
        assert len(segment_ids) == self.config.sequence_length

        task_key = items[self.input_tensor_names.index(self.config.task)]
        task_idx = self.task_id_to_idx[task_key]
        label_input_ids, label_input_mask, label_segment_ids = \
            self.task_id_to_label_features[task_key]

        if self.config.label_name is not None:
            label_value = items[self.input_tensor_names.index(self.config.label_name)]
            if isinstance(label_value, str) or isinstance(label_value, bytes):
                label = convert_to_unicode(label_value)
            else:
                label = str(label_value)
            label_id = str(self.task_id_to_label_mapping[task_key][label])
        else:
            label_id = '0'

        return ' '.join([str(t) for t in input_ids]), \
               ' '.join([str(t) for t in input_mask]), \
               ' '.join([str(t) for t in segment_ids]), \
               ' '.join([str(t) for t in label_input_ids]), \
               ' '.join([str(t) for t in label_input_mask]), \
               ' '.join([str(t) for t in label_segment_ids]), \
               task_idx, label_id

    def process(self, inputs):
        ret = super(MultiTaskClassificationPreprocessor, self).process(inputs)
        if len(ret['task_ids'].shape) > 1:
            ret['task_ids'] = np.squeeze(ret['task_ids'], axis=-1)
        return ret
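

# Minimal sketch of the squeeze in process() above: the base process() may
# return task_ids with a trailing singleton axis, e.g. shape (batch, 1), and
# the override flattens it to (batch,). The array below is made-up data.
def _example_squeeze_task_ids():
    task_ids = np.array([[0], [2], [1]])      # shape (3, 1)
    squeezed = np.squeeze(task_ids, axis=-1)  # shape (3,)
    assert squeezed.tolist() == [0, 2, 1]
    return squeezed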