# Source code for ckip_transformers.nlp.driver

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module implements the CKIP Transformers NLP drivers.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2020 CKIP Lab'
__license__ = 'GPL-3.0'

from typing import (
    List,
)

import numpy as np

from .util import (
    CkipTokenClassification,
    NerToken,
)

################################################################################################################################

class CkipWordSegmenter(CkipTokenClassification):
    """The word segmentation driver.

        Parameters
        ----------
            level : ``int``, *optional*, defaults to 3, must be 1—3
                The model level. The higher the level is, the more accurate and slower the model is.
            device : ``int``, *optional*, defaults to -1
                Device ordinal for CPU/GPU supports.
                Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Model name for each supported ``level``
    # (presumably looked up by ``_get_model_name_from_level`` in the base class).
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-ws',
        2: 'ckiplab/albert-base-chinese-ws',
        3: 'ckiplab/bert-base-chinese-ws',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # An explicit ``model_name`` keyword takes precedence over the model chosen by ``level``;
        # every remaining keyword (e.g. ``device``) is forwarded to the base class.
        model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
        super().__init__(model_name=model_name, **kwargs)

    def __call__(self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[str]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string.
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        Returns
        -------
            ``List[List[str]]``
                A list of list of words (``str``).
        """

        # Call model (extra keyword arguments are forwarded to the base class untouched)
        (
            logits,
            index_map,
        ) = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Post-process results
        output_text = []
        for sent_data in zip(input_text, index_map):
            output_sent = []
            word = ''
            for input_char, logits_index in zip(*sent_data):
                if logits_index is None:
                    # No logits for this character: close the pending word
                    # and emit the character as a standalone word.
                    if word:
                        output_sent.append(word)
                    output_sent.append(input_char)
                    word = ''
                else:
                    # Two scores per character, unpacked as (begin, inside).
                    logits_b, logits_i = logits[logits_index]

                    if logits_b > logits_i:
                        # The character starts a new word: flush the previous one first.
                        if word:
                            output_sent.append(word)
                        word = input_char
                    else:
                        # The character continues the current word.
                        word += input_char

            # Flush the word still being built at the end of the sentence.
            if word:
                output_sent.append(word)
            output_text.append(output_sent)

        return output_text

################################################################################################################################

class CkipPosTagger(CkipTokenClassification):
    """The part-of-speech tagging driver.

        Parameters
        ----------
            level : ``int``, *optional*, defaults to 3, must be 1—3
                The model level. The higher the level is, the more accurate and slower the model is.
            device : ``int``, *optional*, defaults to -1
                Device ordinal for CPU/GPU supports.
                Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Model name for each supported ``level``
    # (presumably looked up by ``_get_model_name_from_level`` in the base class).
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-pos',
        2: 'ckiplab/albert-base-chinese-pos',
        3: 'ckiplab/bert-base-chinese-pos',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # An explicit ``model_name`` keyword takes precedence over the model chosen by ``level``;
        # every remaining keyword (e.g. ``device``) is forwarded to the base class.
        model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
        super().__init__(model_name=model_name, **kwargs)

    def __call__(self,
        input_text: List[List[str]],
        *,
        use_delim: bool = True,
        **kwargs,
    ) -> List[List[str]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[List[str]]``
                The input sentences. Each sentence is a list of strings (words).
            use_delim : ``bool``, *optional*, defaults to True
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        Returns
        -------
            ``List[List[str]]``
                A list of list of POS tags (``str``), one tag per input word.
        """

        # Call model (extra keyword arguments are forwarded to the base class untouched)
        (
            logits,
            index_map,
        ) = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Get labels
        id2label = self.model.config.id2label

        # Post-process results
        output_text = []
        for sent_data in zip(input_text, index_map):
            output_sent = []
            for input_char, logits_index in zip(*sent_data):
                if logits_index is None or input_char.isspace():
                    # Words without logits and whitespace-only words get a fixed tag.
                    label = 'WHITESPACE'
                else:
                    # Highest-scoring class id mapped to its tag name.
                    label = id2label[np.argmax(logits[logits_index])]
                output_sent.append(label)
            output_text.append(output_sent)

        return output_text

################################################################################################################################

class CkipNerChunker(CkipTokenClassification):
    """The named-entity recognition driver.

        Parameters
        ----------
            level : ``int``, *optional*, defaults to 3, must be 1—3
                The model level. The higher the level is, the more accurate and slower the model is.
            device : ``int``, *optional*, defaults to -1
                Device ordinal for CPU/GPU supports.
                Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Model name for each supported ``level``
    # (presumably looked up by ``_get_model_name_from_level`` in the base class).
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-ner',
        2: 'ckiplab/albert-base-chinese-ner',
        3: 'ckiplab/bert-base-chinese-ner',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # An explicit ``model_name`` keyword takes precedence over the model chosen by ``level``;
        # every remaining keyword (e.g. ``device``) is forwarded to the base class.
        model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
        super().__init__(model_name=model_name, **kwargs)

    def __call__(self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[NerToken]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string or a list of strings (words).
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        Returns
        -------
            ``List[List[NerToken]]``
                A list of list of entities (:class:`~.util.NerToken`).
        """

        # Call model (extra keyword arguments are forwarded to the base class untouched)
        (
            logits,
            index_map,
        ) = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Get labels
        id2label = self.model.config.id2label

        # Post-process results
        output_text = []
        for sent_data in zip(input_text, index_map):
            output_sent = []
            # State of the entity currently being assembled;
            # ``entity_ner is None`` means no entity is open.
            entity_word = None
            entity_ner = None
            entity_idx0 = None
            for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)):
                if logits_index is None:
                    # Elements without logits are treated as outside any entity.
                    label = 'O'
                else:
                    label = id2label[np.argmax(logits[logits_index])]

                if label == 'O':
                    # An outside tag discards any partially built entity.
                    entity_ner = None
                    continue

                # Split on the first hyphen only, so an entity type that itself
                # contains a hyphen does not break the two-value unpacking.
                bioes, ner = label.split('-', 1)

                if bioes == 'S':
                    # Single-element entity: emit immediately.
                    output_sent.append(NerToken(
                        word = input_char,
                        ner  = ner,
                        idx  = (index_char, index_char+len(input_char),),
                    ))
                    entity_ner = None
                elif bioes == 'B':
                    # Open a new entity, replacing any unfinished one.
                    entity_word = input_char
                    entity_ner = ner
                    entity_idx0 = index_char
                elif bioes == 'I':
                    if entity_ner == ner:
                        entity_word += input_char
                    else:
                        # Type mismatch (or no open entity): drop the partial entity.
                        entity_ner = None
                elif bioes == 'E':
                    # Emit only when this closes an open entity of the same type.
                    if entity_ner == ner:
                        entity_word += input_char
                        output_sent.append(NerToken(
                            word = entity_word,
                            ner  = entity_ner,
                            idx  = (entity_idx0, index_char+len(input_char),),
                        ))
                    entity_ner = None

            output_text.append(output_sent)

        return output_text