#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
This module implements the CKIP Transformers NLP drivers.
"""
__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2020 CKIP Lab'
__license__ = 'GPL-3.0'
from typing import (
List,
)
import numpy as np
from .util import (
CkipTokenClassification,
NerToken,
)
################################################################################################################################
class CkipWordSegmenter(CkipTokenClassification):
    """Driver that splits raw sentences into words.

    Parameters
    ----------
        level : ``int``, *optional*, defaults to 3, must be 1—3
            The model level; it selects an entry of ``_model_names``.
            The higher the level is, the more accurate and slower the model is.
        model_name : ``str``, *optional*
            Explicit model name. When given it replaces the name derived from ``level``
            (``level`` is still resolved first).
        device : ``int``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports (forwarded to the parent class).
            Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Pre-trained word-segmentation models, keyed by level.
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-ws',
        2: 'ckiplab/albert-base-chinese-ws',
        3: 'ckiplab/bert-base-chinese-ws',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # The level is resolved before looking for an explicit ``model_name`` override.
        name_for_level = self._get_model_name_from_level(level)
        chosen_name = kwargs.pop('model_name', name_for_level)
        super().__init__(model_name=chosen_name, **kwargs)

    def __call__(self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[str]]:
        """Segment every input sentence into a list of words.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string.
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        All keyword arguments other than ``use_delim`` are passed through to the parent ``__call__``.

        Returns
        -------
            ``List[List[str]]``
                A list of list of words (``str``).
        """

        # Run the model: per-position scores plus, for every sentence, the row of
        # ``logits`` belonging to each character (``None`` when there is no row).
        logits, index_map = super().__call__(input_text, use_delim=use_delim, **kwargs)

        segmented = []
        for sentence, row_indices in zip(input_text, index_map):
            words = []
            pending = ''  # characters of the word currently being assembled

            for char, row in zip(sentence, row_indices):
                if row is None:
                    # No model score for this character: close the open word
                    # and emit the character as a token of its own.
                    if pending:
                        words.append(pending)
                    words.append(char)
                    pending = ''
                    continue

                # Each row unpacks into exactly two scores: "begins a word" and "inside a word".
                score_begin, score_inside = logits[row]
                if score_begin > score_inside:
                    # A new word starts here; flush the previous one first.
                    if pending:
                        words.append(pending)
                    pending = char
                else:
                    pending += char

            # Flush whatever is still open at the end of the sentence.
            if pending:
                words.append(pending)
            segmented.append(words)

        return segmented
################################################################################################################################
class CkipPosTagger(CkipTokenClassification):
    """Driver that assigns a part-of-speech tag to every word.

    Parameters
    ----------
        level : ``int``, *optional*, defaults to 3, must be 1—3
            The model level; it selects an entry of ``_model_names``.
            The higher the level is, the more accurate and slower the model is.
        model_name : ``str``, *optional*
            Explicit model name. When given it replaces the name derived from ``level``
            (``level`` is still resolved first).
        device : ``int``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports (forwarded to the parent class).
            Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Pre-trained part-of-speech models, keyed by level.
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-pos',
        2: 'ckiplab/albert-base-chinese-pos',
        3: 'ckiplab/bert-base-chinese-pos',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # The level is resolved before looking for an explicit ``model_name`` override.
        name_for_level = self._get_model_name_from_level(level)
        chosen_name = kwargs.pop('model_name', name_for_level)
        super().__init__(model_name=chosen_name, **kwargs)

    def __call__(self,
        input_text: List[List[str]],
        *,
        use_delim: bool = True,
        **kwargs,
    ) -> List[List[str]]:
        """Tag every word of every input sentence.

        Parameters
        ----------
            input_text : ``List[List[str]]``
                The input sentences. Each sentence is a list of strings (words).
            use_delim : ``bool``, *optional*, defaults to True
                Segment sentence (internally) using ``delim_set``.
            delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        All keyword arguments other than ``use_delim`` are passed through to the parent ``__call__``.

        Returns
        -------
            ``List[List[str]]``
                A list of list of POS tags (``str``), one tag per input word.
        """

        # Run the model: per-position scores plus, for every sentence, the row of
        # ``logits`` belonging to each word (``None`` when there is no row).
        logits, index_map = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Mapping from class id to tag name, taken from the model configuration.
        id2label = self.model.config.id2label

        def tag_for(word, row):
            # Words without a model score, and words made only of whitespace,
            # get the fixed 'WHITESPACE' tag; otherwise the best-scoring class wins.
            if row is None or word.isspace():
                return 'WHITESPACE'
            return id2label[np.argmax(logits[row])]

        return [
            [tag_for(word, row) for word, row in zip(sentence, row_indices)]
            for sentence, row_indices in zip(input_text, index_map)
        ]
################################################################################################################################
class CkipNerChunker(CkipTokenClassification):
    """The named-entity recognition driver.

    Parameters
    ----------
        level : ``int``, *optional*, defaults to 3, must be 1—3
            The model level; it selects an entry of ``_model_names``.
            The higher the level is, the more accurate and slower the model is.
        model_name : ``str``, *optional*
            Explicit model name. When given it replaces the name derived from ``level``
            (``level`` is still resolved first).
        device : ``int``, *optional*, defaults to -1
            Device ordinal for CPU/GPU supports (forwarded to the parent class).
            Setting this to -1 will leverage CPU, a positive will run the model on the associated CUDA device id.
    """

    # Pre-trained named-entity-recognition models, keyed by level.
    _model_names = {
        1: 'ckiplab/albert-tiny-chinese-ner',
        2: 'ckiplab/albert-base-chinese-ner',
        3: 'ckiplab/bert-base-chinese-ner',
    }

    def __init__(self,
        level: int = 3,
        **kwargs,
    ):
        # The level is resolved before looking for an explicit ``model_name`` override.
        model_name = kwargs.pop('model_name', self._get_model_name_from_level(level))
        super().__init__(model_name=model_name, **kwargs)

    def __call__(self,
        input_text: List[str],
        *,
        use_delim: bool = False,
        **kwargs,
    ) -> List[List[NerToken]]:
        """Call the driver.

        Parameters
        ----------
            input_text : ``List[str]``
                The input sentences. Each sentence is a string or a list of strings (words).
            use_delim : ``bool``, *optional*, defaults to False
                Segment sentence (internally) using ``delim_set``.
            delim_set : `str`, *optional*, defaults to ``',,。::;;!!??'``
                Used for sentence segmentation if ``use_delim=True``.
            batch_size : ``int``, *optional*, defaults to 256
                The size of mini-batch.
            max_length : ``int``, *optional*
                The maximum length of the sentence,
                must not be longer than the maximum sequence length for this model
                (i.e. ``tokenizer.model_max_length``).
            show_progress : ``bool``, *optional*, defaults to True
                Show progress bar.

        All keyword arguments other than ``use_delim`` are passed through to the parent ``__call__``.

        Returns
        -------
            ``List[List[NerToken]]``
                A list of list of entities (:class:`~.util.NerToken`).
                Each entity carries its text (``word``), its type (``ner``) and an
                ``idx`` pair ``(start, end)``: ``start`` is the position of the first
                element of the entity in the sentence, ``end`` is the position of the
                last element plus that element's length.

        Raises
        ------
            ``ValueError``
                If a predicted label other than ``'O'`` contains no ``'-'`` separator.
        """

        # Call model: per-position scores plus, for every sentence, the row of
        # ``logits`` belonging to each element (``None`` when there is no row).
        (
            logits,
            index_map,
        ) = super().__call__(input_text, use_delim=use_delim, **kwargs)

        # Get labels
        id2label = self.model.config.id2label

        # Post-process results
        output_text = []
        for sent_data in zip(input_text, index_map):
            output_sent = []

            # State of the multi-element entity currently being assembled.
            # ``entity_ner is None`` means that no entity is open.
            entity_word = None
            entity_ner = None
            entity_idx0 = None

            for index_char, (input_char, logits_index,) in enumerate(zip(*sent_data)):
                if logits_index is None:
                    label = 'O'
                else:
                    label = id2label[np.argmax(logits[logits_index])]

                if label == 'O':
                    # Outside of any entity: drop whatever was being assembled.
                    entity_ner = None
                    continue

                # Split only on the first dash so that entity types which themselves
                # contain a dash (e.g. ``B-WORK-OF-ART``) do not break the unpacking.
                bioes, ner = label.split('-', 1)

                if bioes == 'S':
                    # Single-element entity: emit immediately.
                    output_sent.append(NerToken(
                        word=input_char,
                        ner=ner,
                        idx=(index_char, index_char + len(input_char),),
                    ))
                    entity_ner = None
                elif bioes == 'B':
                    # Beginning of a new entity.
                    entity_word = input_char
                    entity_ner = ner
                    entity_idx0 = index_char
                elif bioes == 'I':
                    # Continuation: only valid when it matches the open entity type.
                    if entity_ner == ner:
                        entity_word += input_char
                    else:
                        entity_ner = None
                elif bioes == 'E':
                    # End of an entity: emit it only when it closes the open entity type.
                    if entity_ner == ner:
                        entity_word += input_char
                        output_sent.append(NerToken(
                            word=entity_word,
                            ner=entity_ner,
                            idx=(entity_idx0, index_char + len(input_char),),
                        ))
                    entity_ner = None

            output_text.append(output_sent)

        return output_text