Source code for ckip_transformers.nlp.driver

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module implements the CKIP Transformers NLP drivers.
"""

__author__ = "Mu Yang <http://muyang.pro>"
__copyright__ = "2023 CKIP Lab"
__license__ = "GPL-3.0"

from typing import (
    List,
)

import numpy as np

from .util import (
    CkipTokenClassification,
    NerToken,
)

################################################################################################################################


class CkipWordSegmenter(CkipTokenClassification):
    """The word segmentation driver.

    Parameters
    ----------
        model : ``str``, *optional*, defaults to "bert-base".
            The pretrained model name provided by CKIP Transformers.
        model_name : ``str``, *optional*, overwrites **model**
            The custom pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ws'``).
        device : ``int`` or ``torch.device``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports.
            Setting this to -1 will leverage CPU, a positive value will run the model on the associated CUDA device id.
    """

    # Short preset names accepted by ``model`` and the pretrained model names they stand for.
    _model_names = {
        "albert-tiny": "ckiplab/albert-tiny-chinese-ws",
        "albert-base": "ckiplab/albert-base-chinese-ws",
        "bert-tiny": "ckiplab/bert-tiny-chinese-ws",
        "bert-base": "ckiplab/bert-base-chinese-ws",
    }

    def __init__(
        self,
        model: str = "bert-base",
        **kwargs,
    ):
        # An explicit ``model_name`` overrides ``model``; the preset is resolved only when
        # ``model_name`` is absent, so ``model`` is never looked up needlessly.
        if "model_name" in kwargs:
            model_name = kwargs.pop("model_name")
        else:
            model_name = self._get_model_name(model)
        super().__init__(model_name=model_name, **kwargs)

    def __call__(
        self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[str]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string.
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.
            pin_memory : ``bool``, *optional*, defaults to True
                Pin memory in order to accelerate the speed of data transfer to the GPU. This option is incompatible with
                multiprocessing. Disabled on CPU device.

        Returns
        -------
            ``List[List[str]]``
                A list of list of words (``str``).
        """

        # Call model
        logits, index_map = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Post-process results
        output_text = []
        for sent_text, sent_index_map in zip(input_text, index_map):
            output_sent = []
            word = ""
            for input_char, logits_index in zip(sent_text, sent_index_map):
                if logits_index is None:
                    # No logits entry for this character: close the pending word
                    # and emit the character as a token of its own.
                    if word:
                        output_sent.append(word)
                    output_sent.append(input_char)
                    word = ""
                    continue

                # Two scores per character (presumably "begin" vs. "inside" of a word).
                logits_b, logits_i = logits[logits_index]

                if logits_b > logits_i:
                    # The character starts a new word: flush the previous one first.
                    if word:
                        output_sent.append(word)
                    word = input_char
                else:
                    # The character continues the current word.
                    word += input_char

            # Flush the word still pending at the end of the sentence.
            if word:
                output_sent.append(word)
            output_text.append(output_sent)

        return output_text


################################################################################################################################


class CkipPosTagger(CkipTokenClassification):
    """The part-of-speech tagging driver.

    Parameters
    ----------
        model : ``str``, *optional*, defaults to "bert-base".
            The pretrained model name provided by CKIP Transformers.
        model_name : ``str``, *optional*, overwrites **model**
            The custom pretrained model name (e.g. ``'ckiplab/bert-base-chinese-pos'``).
        device : ``int`` or ``torch.device``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports.
            Setting this to -1 will leverage CPU, a positive value will run the model on the associated CUDA device id.
    """

    # Short preset names accepted by ``model`` and the pretrained model names they stand for.
    _model_names = {
        "albert-tiny": "ckiplab/albert-tiny-chinese-pos",
        "albert-base": "ckiplab/albert-base-chinese-pos",
        "bert-tiny": "ckiplab/bert-tiny-chinese-pos",
        "bert-base": "ckiplab/bert-base-chinese-pos",
    }

    def __init__(
        self,
        model: str = "bert-base",
        **kwargs,
    ):
        # An explicit ``model_name`` overrides ``model``; the preset is resolved only when
        # ``model_name`` is absent, so ``model`` is never looked up needlessly.
        if "model_name" in kwargs:
            model_name = kwargs.pop("model_name")
        else:
            model_name = self._get_model_name(model)
        super().__init__(model_name=model_name, **kwargs)

    def __call__(
        self,
        input_text: List[List[str]],
        *,
        use_delim: bool = True,
        **kwargs,
    ) -> List[List[str]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[List[str]]``
                The input sentences. Each sentence is a list of strings (words).
            use_delim : ``bool``, *optional*, defaults to True
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.
            pin_memory : ``bool``, *optional*, defaults to True
                Pin memory in order to accelerate the speed of data transfer to the GPU. This option is incompatible with
                multiprocessing. Disabled on CPU device.

        Returns
        -------
            ``List[List[str]]``
                A list of list of POS tags (``str``), one tag per input word.
                Words without a logits entry and whitespace-only words are tagged ``"WHITESPACE"``.
        """

        # Call model
        logits, index_map = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Get labels
        id2label = self.model.config.id2label

        # Post-process results
        output_text = []
        for sent_words, sent_index_map in zip(input_text, index_map):
            output_sent = []
            for input_word, logits_index in zip(sent_words, sent_index_map):
                if logits_index is None or input_word.isspace():
                    # Nothing to classify: no logits entry, or the word is only whitespace.
                    label = "WHITESPACE"
                else:
                    # Pick the label with the highest score for this word.
                    label = id2label[np.argmax(logits[logits_index])]
                output_sent.append(label)
            output_text.append(output_sent)

        return output_text


################################################################################################################################


class CkipNerChunker(CkipTokenClassification):
    """The named-entity recognition driver.

    Parameters
    ----------
        model : ``str``, *optional*, defaults to "bert-base".
            The pretrained model name provided by CKIP Transformers.
        model_name : ``str``, *optional*, overwrites **model**
            The custom pretrained model name (e.g. ``'ckiplab/bert-base-chinese-ner'``).
        device : ``int`` or ``torch.device``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports.
            Setting this to -1 will leverage CPU, a positive value will run the model on the associated CUDA device id.
    """

    # Short preset names accepted by ``model`` and the pretrained model names they stand for.
    _model_names = {
        "albert-tiny": "ckiplab/albert-tiny-chinese-ner",
        "albert-base": "ckiplab/albert-base-chinese-ner",
        "bert-tiny": "ckiplab/bert-tiny-chinese-ner",
        "bert-base": "ckiplab/bert-base-chinese-ner",
    }

    def __init__(
        self,
        model: str = "bert-base",
        **kwargs,
    ):
        # An explicit ``model_name`` overrides ``model``; the preset is resolved only when
        # ``model_name`` is absent, so ``model`` is never looked up needlessly.
        if "model_name" in kwargs:
            model_name = kwargs.pop("model_name")
        else:
            model_name = self._get_model_name(model)
        super().__init__(model_name=model_name, **kwargs)

    def __call__(
        self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[NerToken]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string.
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : ``str``, *optional*, defaults to ``'，,。：:；;！!？?'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.
            pin_memory : ``bool``, *optional*, defaults to True
                Pin memory in order to accelerate the speed of data transfer to the GPU. This option is incompatible with
                multiprocessing. Disabled on CPU device.

        Returns
        -------
            ``List[List[NerToken]]``
                A list of list of entities (:class:`~.util.NerToken`).
                Each entity carries its text (``word``), its type (``ner``) and its
                ``(start, end)`` character span in the sentence (``idx``).
        """

        # Call model
        logits, index_map = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Get labels
        id2label = self.model.config.id2label

        # Post-process results
        output_text = []
        for sent_text, sent_index_map in zip(input_text, index_map):
            output_sent = []
            # State of the multi-character entity currently being assembled.
            # ``entity_ner is None`` means that no entity is open.
            entity_word = None
            entity_ner = None
            entity_idx0 = None
            for index_char, (input_char, logits_index) in enumerate(zip(sent_text, sent_index_map)):
                if logits_index is None:
                    # Characters without a logits entry are treated as outside any entity.
                    label = "O"
                else:
                    label = id2label[np.argmax(logits[logits_index])]

                if label == "O":
                    # Leaving entity territory discards any unfinished entity.
                    entity_ner = None
                    continue

                # Split only on the first hyphen so that entity types which themselves
                # contain a hyphen do not break the unpacking.
                bioes, ner = label.split("-", 1)

                if bioes == "S":
                    # Single-character entity: emit it immediately.
                    output_sent.append(
                        NerToken(
                            word=input_char,
                            ner=ner,
                            idx=(
                                index_char,
                                index_char + len(input_char),
                            ),
                        )
                    )
                    entity_ner = None
                elif bioes == "B":
                    # Begin a new entity (replacing any unfinished one).
                    entity_word = input_char
                    entity_ner = ner
                    entity_idx0 = index_char
                elif bioes == "I":
                    if entity_ner == ner:
                        entity_word += input_char
                    else:
                        # Type mismatch or no open entity: drop the partial entity.
                        entity_ner = None
                elif bioes == "E":
                    # Emit the entity only if it closes an open entity of the same type.
                    if entity_ner == ner:
                        entity_word += input_char
                        output_sent.append(
                            NerToken(
                                word=entity_word,
                                ner=entity_ner,
                                idx=(
                                    entity_idx0,
                                    index_char + len(input_char),
                                ),
                            )
                        )
                    entity_ner = None

            output_text.append(output_sent)

        return output_text