|
a |
|
b/notebooks/tokenizer/tokenizer.md |
|
|
1 |
--- |
|
|
2 |
jupyter: |
|
|
3 |
jupytext: |
|
|
4 |
formats: ipynb,md |
|
|
5 |
text_representation: |
|
|
6 |
extension: .md |
|
|
7 |
format_name: markdown |
|
|
8 |
format_version: "1.3" |
|
|
9 |
jupytext_version: 1.11.4 |
|
|
10 |
kernelspec: |
|
|
11 |
display_name: Python 3 |
|
|
12 |
language: python |
|
|
13 |
name: python3 |
|
|
14 |
--- |
|
|
15 |
|
|
|
16 |
```python |
|
|
17 |
%reload_ext autoreload |
|
|
18 |
%autoreload 2 |
|
|
19 |
``` |
|
|
20 |
|
|
|
21 |
```python |
|
|
22 |
# Importation du "contexte", ie la bibliothèque sans installation |
|
|
23 |
import context |
|
|
24 |
``` |
|
|
25 |
|
|
|
26 |
```python |
|
|
27 |
import spacy |
|
|
28 |
``` |
|
|
29 |
|
|
|
30 |
```python |
|
|
31 |
# One-shot import of all declared spaCy components |
|
|
32 |
|
|
|
33 |
``` |
|
|
34 |
|
|
|
35 |
# Baselines |
|
|
36 |
|
|
|
37 |
```python |
|
|
38 |
import re |
|
|
39 |
import spacy |
|
|
40 |
|
|
|
41 |
from spacy.tokenizer import Tokenizer |
|
|
42 |
from spacy.util import compile_prefix_regex, compile_suffix_regex |
|
|
43 |
|
|
|
44 |
# Extra rules so that common punctuation and the "->" arrow act as infixes
def custom_tokenizer(nlp):
    """Build a Tokenizer that also splits tokens on punctuation infixes.

    Parameters
    ----------
    nlp :
        spaCy pipeline whose vocab and default prefix/suffix rules are reused.
        ``-`` is added to the default prefixes so leading dashes
        (e.g. ``-problème``) are split off.

    Returns
    -------
    spacy.tokenizer.Tokenizer
        Tokenizer wired with the custom prefix/suffix/infix patterns.
    """
    # BUG FIX: the original pattern embedded ``(->)`` inside a character
    # class, where ``(`` ``-`` ``>`` is parsed as a *range* from "(" (0x28)
    # to ">" (0x3E) — it silently matched digits and ";<=" as infixes.
    # The literal two-character arrow "->" must be an alternative *outside*
    # the class instead.
    infix_re = re.compile(r'''->|[,\?:;‘’`“”"'~/\(\)\.\+=\$]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes + ['-'])
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
|
|
55 |
|
|
|
56 |
def new_nlp():
    """Create a blank French pipeline equipped with the custom tokenizer."""
    pipeline = spacy.blank('fr')
    pipeline.tokenizer = custom_tokenizer(pipeline)
    return pipeline
|
|
62 |
``` |
|
|
63 |
|
|
|
64 |
```python |
|
|
65 |
nlp = new_nlp() |
|
|
66 |
``` |
|
|
67 |
|
|
|
68 |
```python |
|
|
69 |
# nlp.add_pipe('sentencizer') |
|
|
70 |
nlp.add_pipe('matcher', config=dict(regex=dict(douleurs=['blème de locomotion', 'douleurs', 'IMV']))) |
|
|
71 |
nlp.add_pipe('sections') |
|
|
72 |
nlp.add_pipe('pollution') |
|
|
73 |
``` |
|
|
74 |
|
|
|
75 |
```python |
|
|
76 |
text = ( |
|
|
77 |
"Le patient est admis pour des douleurs dans le bras droit, mais n'a pas de problème de locomotion. Test(et oui) " |
|
|
78 |
"Historique d'AVC dans la famille. pourrait être un cas de rhume.\n" |
|
|
79 |
"NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbNBWbNbWbNbWBNbNbWbNbNBNbWbWbNbWBNbNbWbNBNbWbWbNb\n" |
|
|
80 |
"IMV--deshabillé\n" |
|
|
81 |
"Pourrait être un cas de rhume.\n" |
|
|
82 |
"Motif :\n" |
|
|
83 |
"-problème de locomotions==+test\n" |
|
|
84 |
"Douleurs dans le bras droit." |
|
|
85 |
) |
|
|
86 |
``` |
|
|
87 |
|
|
|
88 |
```python |
|
|
89 |
doc = nlp(text) |
|
|
90 |
``` |
|
|
91 |
|
|
|
92 |
```python |
|
|
93 |
doc.ents |
|
|
94 |
``` |
|
|
95 |
|
|
|
96 |
```python |
|
|
97 |
doc[19] |
|
|
98 |
``` |
|
|
99 |
|
|
|
100 |
```python |
|
|
101 |
doc._.sections |
|
|
102 |
``` |
|
|
103 |
|
|
|
104 |
```python |
|
|
105 |
doc._.clean_ |
|
|
106 |
``` |
|
|
107 |
|
|
|
108 |
```python |
|
|
109 |
doc[17]._.ascii_ |
|
|
110 |
``` |
|
|
111 |
|
|
|
112 |
```python |
|
|
113 |
doc._.clean_ |
|
|
114 |
``` |
|
|
115 |
|
|
|
116 |
On peut tester l'extraction d'entité dans le texte nettoyé : |
|
|
117 |
|
|
|
118 |
```python |
|
|
119 |
doc_clean = nlp(doc._.clean_) |
|
|
120 |
``` |
|
|
121 |
|
|
|
122 |
```python |
|
|
123 |
ent = doc_clean[64:68] |
|
|
124 |
ent |
|
|
125 |
``` |
|
|
126 |
|
|
|
127 |
Les deux textes ne sont plus alignés : |
|
|
128 |
|
|
|
129 |
```python |
|
|
130 |
doc.text[ent.start_char:ent.end_char] |
|
|
131 |
``` |
|
|
132 |
|
|
|
133 |
Mais la méthode `char_clean_span` permet de réaligner les deux représentations : |
|
|
134 |
|
|
|
135 |
```python |
|
|
136 |
doc._.char_clean_span(ent.start_char, ent.end_char) |
|
|
137 |
``` |
|
|
138 |
|
|
|
139 |
```python |
|
|
140 |
|
|
|
141 |
``` |