|
a |
|
b/notebooks/sentences/sentences.md |
|
|
1 |
--- |
|
|
2 |
jupyter: |
|
|
3 |
jupytext: |
|
|
4 |
formats: ipynb,md |
|
|
5 |
text_representation: |
|
|
6 |
extension: .md |
|
|
7 |
format_name: markdown |
|
|
8 |
format_version: '1.3' |
|
|
9 |
jupytext_version: 1.11.4 |
|
|
10 |
kernelspec: |
|
|
11 |
display_name: Python 3 |
|
|
12 |
language: python |
|
|
13 |
name: python3 |
|
|
14 |
--- |
|
|
15 |
|
|
|
16 |
```python |
|
|
17 |
%reload_ext autoreload |
|
|
18 |
%autoreload 2 |
|
|
19 |
``` |
|
|
20 |
|
|
|
21 |
```python |
|
|
22 |
# Importation du "contexte", ie la bibliothèque sans installation |
|
|
23 |
import context |
|
|
24 |
``` |
|
|
25 |
|
|
|
26 |
```python |
|
|
27 |
import spacy |
|
|
28 |
``` |
|
|
29 |
|
|
|
30 |
```python |
|
|
31 |
from edsnlp.pipelines.sentences import SentenceSegmenter |
|
|
32 |
``` |
|
|
33 |
|
|
|
34 |
# Sentences |
|
|
35 |
|
|
|
36 |
```python |
|
|
37 |
import re |
|
|
38 |
import spacy |
|
|
39 |
|
|
|
40 |
from spacy.tokenizer import Tokenizer |
|
|
41 |
from spacy.util import compile_prefix_regex, compile_suffix_regex |
|
|
42 |
|
|
|
43 |
# Ajout de règles supplémentaires pour gérer les infix |
|
|
44 |
def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` for ``nlp`` with extra punctuation rules.

    The tokenizer reuses the language's default prefix and suffix rules,
    additionally treats ``-`` as a prefix, and splits tokens on a custom
    set of infix characters.

    Parameters
    ----------
    nlp
        spaCy pipeline whose ``vocab`` and ``Defaults.prefixes`` /
        ``Defaults.suffixes`` are used to configure the tokenizer.

    Returns
    -------
    Tokenizer
        A tokenizer bound to ``nlp.vocab``.
    """
    # The hyphen is escaped on purpose: inside a character class an
    # unescaped "(->" is the range "(" to ">", which would also turn every
    # digit, "*" and "<" into an infix separator.
    infix_re = re.compile(r'''[\,\?\:\;\‘\’\`\“\”\"\'~/\(\)\.\+=(\->)\$]''')

    # list(...) keeps the concatenation valid whether the language defaults
    # are exposed as a list or as a tuple.
    prefixes = list(nlp.Defaults.prefixes) + ['-']
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
|
|
54 |
|
|
|
55 |
def new_nlp():
    """Return a blank French spaCy pipeline fitted with the custom tokenizer."""
    pipeline = spacy.blank('fr')
    # Swap the default tokenizer for the one built by custom_tokenizer.
    pipeline.tokenizer = custom_tokenizer(pipeline)
    return pipeline
|
|
61 |
``` |
|
|
62 |
|
|
|
63 |
```python |
|
|
64 |
text = ( |
|
|
65 |
"Le patient est admis pour des douleurs dans le bras droit, mais n'a pas de problème de locomotion. " |
|
|
66 |
"Historique d'AVC dans la famille mais\n" |
|
|
67 |
"pourrait être un cas de rhume\n" |
|
|
68 |
"Pourrait aussi être un cas de rhume.\n" |
|
|
69 |
"Motif :\n" |
|
|
70 |
"-problème de locomotions\n" |
|
|
71 |
"Douleurs dans le bras droit.\n\n\n\n" |
|
|
72 |
) |
|
|
73 |
``` |
|
|
74 |
|
|
|
75 |
```python |
|
|
76 |
nlp = new_nlp() |
|
|
77 |
nlp.add_pipe('sentencizer') |
|
|
78 |
``` |
|
|
79 |
|
|
|
80 |
```python |
|
|
81 |
doc = nlp(text) |
|
|
82 |
``` |
|
|
83 |
|
|
|
84 |
```python |
|
|
85 |
for sent in doc.sents: |
|
|
86 |
print('##', repr(sent.text)) |
|
|
87 |
``` |
|
|
88 |
|
|
|
89 |
```python |
|
|
90 |
nlp = new_nlp() |
|
|
91 |
``` |
|
|
92 |
|
|
|
93 |
```python |
|
|
94 |
sentencer = SentenceSegmenter() |
|
|
95 |
``` |
|
|
96 |
|
|
|
97 |
```python |
|
|
98 |
doc = sentencer(nlp(text)) |
|
|
99 |
``` |
|
|
100 |
|
|
|
101 |
```python |
|
|
102 |
for sent in doc.sents: |
|
|
103 |
print('##', repr(sent.text)) |
|
|
104 |
``` |
|
|
105 |
|
|
|
106 |
Note that newline characters are now attached to the preceding sentence. This is especially relevant when the document ends with a newline. |
|
|
107 |
|
|
|
108 |
```python |
|
|
109 |
|
|
|
110 |
``` |