[1bdb11]: / aitrika / extractors / pdf_extractor.py

Download this file

78 lines (69 with data), 2.9 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
from typing import List, Optional, Tuple

from Bio import Entrez, Medline
from PyPDF2 import PdfReader

from aitrika.config.config import Config
from aitrika.utils.load_spacy_model import load_spacy_model
class PDFExtractor:
    """Extract the title, authors, PubMed ID and full text of a PDF paper.

    The title and author lines are guessed heuristically from the text of the
    first page; the PubMed ID is looked up through NCBI Entrez.
    """

    # Shared configuration; ``ENTREZ_EMAIL`` is read from it for Entrez calls.
    config = Config()

    def __init__(self, pdf_path: str):
        """Store the path of the PDF file to read.

        Args:
            pdf_path: Filesystem path of the PDF; opened lazily by the
                extraction methods.
        """
        self.pdf_path = pdf_path

    def extract_title_and_authors(self) -> Tuple[str, str]:
        """Return the ``(title, authors)`` found on the first page.

        Returns:
            The title with standalone numbers removed, and the authors as a
            single ``", "``-separated string without digits or ``*`` markers.

        Raises:
            ValueError: If no author line or no title line can be identified.
        """
        with open(self.pdf_path, "rb") as f:
            reader = PdfReader(f)
            first_page = reader.pages[0].extract_text() or ""

        original_header = [
            line.strip() for line in first_page.split("\n") if line.strip()
        ]
        # Detection runs on digit-free copies so affiliation markers such as
        # "Smith1" do not disturb it; indexes stay aligned with the originals.
        pre_header = [re.sub(r"\d+", "", s) for s in original_header]

        authors = self._detect_authors(pre_header)
        if authors is None:
            raise ValueError(
                f"Could not detect an author line on the first page of "
                f"{self.pdf_path!r}"
            )
        # Hand over the detected author line so it is not computed twice.
        title = self._detect_title(pre_header, author_string=authors)
        if title is None:
            raise ValueError(
                f"Could not detect a title line on the first page of "
                f"{self.pdf_path!r}"
            )

        original_title = original_header[pre_header.index(title)]
        original_authors = original_header[pre_header.index(authors)]

        title = re.sub(r"\b\d+\b", "", original_title).strip()
        return title, self._clean_authors(original_authors)

    @staticmethod
    def _clean_authors(raw_authors: str) -> str:
        """Normalise a raw author line into a ``", "``-separated string.

        Digits and ``*`` markers are dropped, the word ``and`` is treated as a
        separator (so "A and B" yields "A, B" rather than fusing both names),
        and empty entries and repeated whitespace are removed.
        """
        without_marks = re.sub(r"\d+", "", raw_authors).replace("*", "")
        separated = re.sub(r"\band\b", ",", without_marks)
        names = (" ".join(part.split()) for part in separated.split(","))
        return ", ".join(name for name in names if name)

    def _detect_authors(self, text: List[str]) -> Optional[str]:
        """Return the first line containing a spaCy ``PERSON`` entity.

        Args:
            text: Candidate header lines.

        Returns:
            The first matching line, or ``None`` when no line matches.
        """
        nlp = load_spacy_model()
        for line in text:
            if any(ent.label_ == "PERSON" for ent in nlp(line).ents):
                return line
        return None

    def _detect_title(
        self, text: List[str], author_string: Optional[str] = None
    ) -> Optional[str]:
        """Return the first line that looks like the title.

        A line qualifies when it is not the author line, not the first line
        starting with "abstract" (case-insensitive), and does not start with
        ``[``, ``{`` or ``(``.

        Args:
            text: Candidate header lines.
            author_string: Already-detected author line. When ``None`` it is
                detected here via ``_detect_authors``.

        Returns:
            The title line, or ``None`` when every line is excluded.
        """
        if author_string is None:
            author_string = self._detect_authors(text)
        abstract_string = next(
            (s for s in text if s.lower().startswith("abstract")), None
        )
        for candidate in text:
            if candidate == author_string or candidate == abstract_string:
                continue
            if candidate.startswith(("[", "{", "(")):
                continue
            return candidate
        return None

    def retrieve_pubmed_id(self, title: str, authors: str) -> str:
        """Look up the PubMed ID of a paper by title and authors.

        Args:
            title: Paper title used in the search term.
            authors: Author string, searched with the ``[Author]`` field tag.

        Returns:
            The ``PMID`` field of the first matching record.

        Raises:
            IndexError: If the search returns no record.
        """
        query = f"({title}) AND ({authors})[Author]"
        Entrez.email = self.config.ENTREZ_EMAIL

        # Entrez.read only parses XML, so request XML explicitly.
        handle = Entrez.esearch(db="pubmed", retmode="xml", term=query)
        try:
            search_result = Entrez.read(handle)
        finally:
            handle.close()

        id_list = search_result["IdList"]
        if not id_list:
            raise IndexError(f"No PubMed record found for query: {query}")

        handle = Entrez.efetch(
            db="pubmed", id=id_list[0], rettype="medline", retmode="text"
        )
        try:
            # The response is MEDLINE-formatted text, not XML, so it needs the
            # Medline parser rather than Entrez.read.
            record = Medline.read(handle)
        finally:
            handle.close()
        return record["PMID"]

    def extract_full_text(self) -> str:
        """Return the concatenated text of every page of the PDF."""
        with open(self.pdf_path, "rb") as f:
            reader = PdfReader(f)
            # ``or ""`` guards against a page that yields no text.
            return "".join(page.extract_text() or "" for page in reader.pages)