-
Notifications
You must be signed in to change notification settings - Fork 59
Open
Description
Hi, there.
The English model seems to be able to detect the named entity well, but the ja_ginza model does not.
Code:
from typing import List
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer import PatternRecognizer
class NumbersRecognizer(EntityRecognizer):
expected_confidence_level = 0.7
def load(self) -> None:
pass
def analyze(
self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
) -> List[RecognizerResult]:
results = []
for token in nlp_artifacts.tokens:
if token.like_num:
result = RecognizerResult(
entity_type = 'NUMBER',
start = token.idx,
end = token.idx + len(token),
score = self.expected_confidence_level
)
results.append(result)
return results
print('en test')
analyzer = AnalyzerEngine()
new_numbers_recognizer = NumbersRecognizer(supported_entities = ['NUMBER'])
analyzer.registry.add_recognizer(new_numbers_recognizer)
text3 = 'Roberto lives in Five 10 Broad st.'
numbers_results2 = analyzer.analyze(text = text3, language = 'en')
print('\n'.join([str(res) for res in numbers_results2]))
print('\nja test')
analyzer = AnalyzerEngine(
nlp_engine = NlpEngineProvider(
nlp_configuration = {
'nlp_engine_name': 'spacy', 'models': [
{'lang_code': 'ja', 'model_name': 'ja_ginza'}
]
}
).create_engine(),
supported_languages = ['ja']
)
money_recognizer = NumbersRecognizer(supported_entities = ['Money'])
analyzer.registry.add_recognizer(money_recognizer)
text3 = '¥20'
numbers_results2 = analyzer.analyze(text = text3, language = 'ja')
print('\n'.join([str(res) for res in numbers_results2]))
Output:
root@presidio:/home/root# python3 test.py
/usr/local/lib/python3.8/dist-packages/pandas/compat/_optional.py:149: UserWarning: Pandas requires version '1.3.1' or newer of 'bottleneck' (version '1.2.1' currently installed).
warnings.warn(msg, UserWarning)
en test
type: PERSON, start: 0, end: 7, score: 0.85
type: NUMBER, start: 17, end: 21, score: 0.7
type: NUMBER, start: 22, end: 24, score: 0.7
ja test
Could you give me some advice?
Metadata
Metadata
Assignees
Labels
No labels