Diff of /trp.py [000000] .. [302778]

Switch to unified view

a b/trp.py
1
import json
2
3
class BoundingBox:
    """Rectangle described by a width, a height, a left offset and a top offset.

    The four values are stored exactly as given and exposed through
    read-only properties.
    """

    def __init__(self, width, height, left, top):
        self._width, self._height = width, height
        self._left, self._top = left, top

    def __str__(self):
        template = "width: {}, height: {}, left: {}, top: {}"
        return template.format(self._width, self._height, self._left, self._top)

    @property
    def width(self):
        """Width passed to the constructor."""
        return self._width

    @property
    def height(self):
        """Height passed to the constructor."""
        return self._height

    @property
    def left(self):
        """Left offset passed to the constructor."""
        return self._left

    @property
    def top(self):
        """Top offset passed to the constructor."""
        return self._top
28
29
class Polygon:
    """A single (x, y) coordinate pair.

    Despite the name, one instance holds one point only.
    """

    def __init__(self, x, y):
        self._x, self._y = x, y

    def __str__(self):
        template = "x: {}, y: {}"
        return template.format(self._x, self._y)

    @property
    def x(self):
        """X value passed to the constructor."""
        return self._x

    @property
    def y(self):
        """Y value passed to the constructor."""
        return self._y
44
45
class Geometry:
    """Bounding box and polygon points parsed from a geometry mapping.

    The constructor expects a mapping with a "BoundingBox" entry (keys
    "Width", "Height", "Left", "Top") and a "Polygon" entry (an iterable
    of mappings with keys "X" and "Y").
    """

    def __init__(self, geometry):
        box = geometry["BoundingBox"]
        points = geometry["Polygon"]

        self._boundingBox = BoundingBox(
            box["Width"], box["Height"], box["Left"], box["Top"]
        )
        # One Polygon instance per point, in input order.
        self._polygon = [Polygon(point["X"], point["Y"]) for point in points]

    def __str__(self):
        # Only the bounding box is rendered; the polygon points are omitted.
        return "BoundingBox: {}\n".format(str(self._boundingBox))

    @property
    def boundingBox(self):
        """The BoundingBox built from the "BoundingBox" entry."""
        return self._boundingBox

    @property
    def polygon(self):
        """List of Polygon points built from the "Polygon" entry."""
        return self._polygon
68
69
class Word:
    """A WORD block: its text, confidence, geometry and id.

    Args:
        block: Mapping with keys 'Confidence', 'Geometry', 'Id' and 'Text'.
        blockMap: Accepted for signature parity with the other block
            wrappers; not read by this class.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        # A falsy 'Text' value (e.g. None or "") is normalised to "".
        self._text = block['Text'] or ""

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def text(self):
        """The block's 'Text', or "" when it was falsy."""
        return self._text

    @property
    def block(self):
        """The raw block mapping this word was built from."""
        return self._block
101
102
class Line:
    """A LINE block together with the WORD blocks it references.

    Args:
        block: Mapping with keys 'Confidence', 'Geometry', 'Id', 'Text' and
            optionally 'Relationships'.
        blockMap: Mapping from block id to block, used to resolve child ids.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        # A falsy 'Text' value is normalised to "".
        self._text = block['Text'] or ""

        self._words = []
        # A missing or empty 'Relationships' entry means there are no words.
        relationships = block.get('Relationships') or []
        for relationship in relationships:
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                child = blockMap[child_id]
                if child["BlockType"] == "WORD":
                    self._words.append(Word(child, blockMap))

    def __str__(self):
        pieces = ["Line\n==========\n", self._text, "\n", "Words\n----------\n"]
        pieces.extend("[{}]".format(str(word)) for word in self._words)
        return "".join(pieces)

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def words(self):
        """Word objects for the CHILD ids whose block type is WORD."""
        return self._words

    @property
    def text(self):
        """The block's 'Text', or "" when it was falsy."""
        return self._text

    @property
    def block(self):
        """The raw block mapping this line was built from."""
        return self._block
152
153
class SelectionElement:
    """A SELECTION_ELEMENT block: confidence, geometry, id and status.

    Args:
        block: Mapping with keys 'Confidence', 'Geometry', 'Id' and
            'SelectionStatus'.
        blockMap: Accepted for signature parity with the other block
            wrappers; not read by this class.
    """

    def __init__(self, block, blockMap):
        confidence = block['Confidence']
        geometry = Geometry(block['Geometry'])
        element_id = block['Id']
        status = block['SelectionStatus']

        self._confidence = confidence
        self._geometry = geometry
        self._id = element_id
        self._selectionStatus = status

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def selectionStatus(self):
        """The block's 'SelectionStatus' value, stored unchanged."""
        return self._selectionStatus
175
176
class FieldKey:
    """Key side of a form field, assembled from its child WORD blocks.

    Args:
        block: The key block; 'Confidence', 'Geometry' and 'Id' are read.
        children: Iterable of child block ids.
        blockMap: Mapping from block id to block, used to resolve children.
    """

    def __init__(self, block, children, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._content = []

        # Only WORD children contribute; every other block type is skipped.
        for child_id in children:
            child = blockMap[child_id]
            if child['BlockType'] != "WORD":
                continue
            self._content.append(Word(child, blockMap))

        # Joining an empty sequence yields "", the text of a key with no words.
        self._text = ' '.join(word.text for word in self._content)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def content(self):
        """Word objects for the WORD children, in child order."""
        return self._content

    @property
    def text(self):
        """Texts of the child words joined by single spaces."""
        return self._text

    @property
    def block(self):
        """The raw block mapping this key was built from."""
        return self._block
223
224
class FieldValue:
    """Value side of a form field, built from WORD and SELECTION_ELEMENT children.

    Args:
        block: The value block; 'Confidence', 'Geometry' and 'Id' are read.
        children: Iterable of child block ids.
        blockMap: Mapping from block id to block, used to resolve children.
    """

    def __init__(self, block, children, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = ""
        self._content = []

        word_texts = []
        for child_id in children:
            child = blockMap[child_id]
            kind = child['BlockType']
            if kind == "WORD":
                word = Word(child, blockMap)
                self._content.append(word)
                word_texts.append(word.text)
            elif kind == "SELECTION_ELEMENT":
                element = SelectionElement(child, blockMap)
                self._content.append(element)
                # The most recent selection status becomes the text ...
                self._text = element.selectionStatus

        # ... unless any words were found, in which case they take precedence.
        if word_texts:
            self._text = ' '.join(word_texts)

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def content(self):
        """Word and SelectionElement objects, in child order."""
        return self._content

    @property
    def text(self):
        """Joined word texts, else the last selection status, else ""."""
        return self._text

    @property
    def block(self):
        """The raw block mapping this value was built from."""
        return self._block
275
276
class Field:
    """A key/value pair of a form, built from a key block's relationships.

    Args:
        block: The key block; its 'Relationships' list is required.
        blockMap: Mapping from block id to block, used to resolve value ids.
    """

    def __init__(self, block, blockMap):
        self._key = None
        self._value = None

        for relationship in block['Relationships']:
            kind = relationship["Type"]
            if kind == "CHILD":
                # The key block's own children form the key text.
                self._key = FieldKey(block, relationship['Ids'], blockMap)
                continue
            if kind != "VALUE":
                continue

            for value_id in relationship['Ids']:
                value_block = blockMap[value_id]
                if 'VALUE' not in value_block['EntityTypes']:
                    continue
                if 'Relationships' not in value_block:
                    # A value block without relationships has no content.
                    continue
                for value_relationship in value_block['Relationships']:
                    if value_relationship["Type"] == "CHILD":
                        self._value = FieldValue(
                            value_block, value_relationship['Ids'], blockMap
                        )

    def __str__(self):
        key_text = str(self._key) if self._key else ""
        value_text = str(self._value) if self._value else ""
        return "\nField\n==========\n" + "Key: {}\nValue: {}".format(key_text, value_text)

    @property
    def key(self):
        """The FieldKey, or None when the block had no CHILD relationship."""
        return self._key

    @property
    def value(self):
        """The FieldValue, or None when no value content was found."""
        return self._value
310
311
class Form:
    """Ordered collection of fields with lookup by key text."""

    def __init__(self):
        self._fields = []      # every added field, in insertion order
        self._fieldsMap = {}   # key text -> most recently added field with that text

    def addField(self, field):
        """Append *field* and index it by its key text.

        A field whose ``key`` is None is still appended to ``fields`` but is
        not indexed, since it has no key text to index by. When several
        fields share the same key text, the last one added wins in the index.
        """
        self._fields.append(field)
        # Guard against a missing key, consistent with searchFieldsByKey.
        if field.key is not None:
            self._fieldsMap[field.key.text] = field

    def __str__(self):
        return "".join(str(field) + "\n" for field in self._fields)

    @property
    def fields(self):
        """List of all added fields, in insertion order."""
        return self._fields

    def getFieldByKey(self, key):
        """Return the field whose key text equals *key* exactly, or None."""
        return self._fieldsMap.get(key)

    def searchFieldsByKey(self, key):
        """Return the fields whose key text contains *key*, ignoring case.

        Fields without a key are skipped. Matches keep insertion order.
        """
        searchKey = key.lower()
        return [
            field
            for field in self._fields
            if field.key and searchKey in field.key.text.lower()
        ]
343
344
class Cell:
    """A table cell with its position, span and textual content.

    Args:
        block: Mapping with keys 'Confidence', 'RowIndex', 'ColumnIndex',
            'RowSpan', 'ColumnSpan', 'Geometry', 'Id' and optionally
            'Relationships'.
        blockMap: Mapping from block id to block, used to resolve child ids.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._rowIndex = block['RowIndex']
        self._columnIndex = block['ColumnIndex']
        self._rowSpan = block['RowSpan']
        self._columnSpan = block['ColumnSpan']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._content = []
        self._text = ""

        # A missing or empty 'Relationships' entry leaves the cell empty.
        relationships = block.get('Relationships') or []
        for relationship in relationships:
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                child = blockMap[child_id]
                kind = child["BlockType"]
                if kind == "WORD":
                    word = Word(child, blockMap)
                    self._content.append(word)
                    # Each word is followed by a space, including the last.
                    self._text += word.text + ' '
                elif kind == "SELECTION_ELEMENT":
                    element = SelectionElement(child, blockMap)
                    self._content.append(element)
                    # Each selection status is followed by ", ".
                    self._text += element.selectionStatus + ', '

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def rowIndex(self):
        """The block's 'RowIndex' value."""
        return self._rowIndex

    @property
    def columnIndex(self):
        """The block's 'ColumnIndex' value."""
        return self._columnIndex

    @property
    def rowSpan(self):
        """The block's 'RowSpan' value."""
        return self._rowSpan

    @property
    def columnSpan(self):
        """The block's 'ColumnSpan' value."""
        return self._columnSpan

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def content(self):
        """Word and SelectionElement objects, in child order."""
        return self._content

    @property
    def text(self):
        """Concatenated child text, with a trailing separator per child."""
        return self._text

    @property
    def block(self):
        """The raw block mapping this cell was built from."""
        return self._block
413
414
class Row:
    """A table row: a mutable list of cells."""

    def __init__(self):
        self._cells = []

    def __str__(self):
        # Each cell is rendered inside square brackets, with no separator.
        return "".join("[{}]".format(str(cell)) for cell in self._cells)

    @property
    def cells(self):
        """The list of cells; callers append to it directly."""
        return self._cells
427
428
class Table:
    """A TABLE block whose child cells are grouped into rows.

    Args:
        block: Mapping with keys 'Confidence', 'Geometry', 'Id' and
            optionally 'Relationships'.
        blockMap: Mapping from block id to block, used to resolve cell ids.
    """

    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._rows = []

        current_index = 1
        current_row = Row()
        relationships = block.get('Relationships') or []
        for relationship in relationships:
            if relationship['Type'] != 'CHILD':
                continue
            for cell_id in relationship['Ids']:
                cell = Cell(blockMap[cell_id], blockMap)
                # A higher row index closes the current row and opens a new
                # one; this relies on cells arriving in row order.
                if cell.rowIndex > current_index:
                    self._rows.append(current_row)
                    current_row = Row()
                    current_index = cell.rowIndex
                current_row.cells.append(cell)
            # Flush the row still being filled once the ids are exhausted.
            if current_row and current_row.cells:
                self._rows.append(current_row)

    def __str__(self):
        pieces = ["Table\n==========\n"]
        for row in self._rows:
            pieces.append("Row\n==========\n")
            pieces.append(str(row) + "\n")
        return "".join(pieces)

    @property
    def confidence(self):
        """The block's 'Confidence' value."""
        return self._confidence

    @property
    def geometry(self):
        """Geometry built from the block's 'Geometry' entry."""
        return self._geometry

    @property
    def id(self):
        """The block's 'Id' value."""
        return self._id

    @property
    def rows(self):
        """List of Row objects, in the order they were completed."""
        return self._rows

    @property
    def block(self):
        """The raw block mapping this table was built from."""
        return self._block
482
483
class Page:
    """One page of a document: its lines, form fields, tables and geometry.

    Args:
        blocks: Iterable of block mappings belonging to this page.
        blockMap: Mapping from block id to block, used to resolve the
            relationships of lines, tables and fields.
    """

    def __init__(self, blocks, blockMap):
        self._blocks = blocks
        self._text = ""
        self._lines = []
        self._form = Form()
        self._tables = []
        self._content = []
        # Defaults so the geometry/id properties return None instead of
        # raising AttributeError when the blocks contain no PAGE block.
        self._geometry = None
        self._id = None

        self._parse(blockMap)

    def __str__(self):
        return "Page\n==========\n" + "".join(str(item) + "\n" for item in self._content)

    def _parse(self, blockMap):
        """Dispatch every block by its 'BlockType' and populate the page."""
        for item in self._blocks:
            blockType = item["BlockType"]
            if blockType == "PAGE":
                self._geometry = Geometry(item['Geometry'])
                self._id = item['Id']
            elif blockType == "LINE":
                line = Line(item, blockMap)
                self._lines.append(line)
                self._content.append(line)
                self._text = self._text + line.text + '\n'
            elif blockType == "TABLE":
                table = Table(item, blockMap)
                self._tables.append(table)
                self._content.append(table)
            elif blockType == "KEY_VALUE_SET":
                # Only KEY entities start a field; values are reached
                # through the key's relationships.
                if 'KEY' in item['EntityTypes']:
                    field = Field(item, blockMap)
                    if field.key:
                        self._form.addField(field)
                        self._content.append(field)
                    else:
                        print("WARNING: Detected K/V where key does not have content. Excluding key from output.")
                        print(field)
                        print(item)

    def getLinesInReadingOrder(self):
        """Group lines into columns and return them column by column.

        A line joins the first existing column for which either the line's
        horizontal centre lies strictly inside the column, or the column's
        horizontal centre lies strictly inside the line. Otherwise the line
        opens a new column with its own left/right extent.

        Returns:
            A list of ``[columnIndex, text]`` pairs sorted by column index;
            within a column the original line order is kept (stable sort).
        """
        columns = []
        lines = []
        for item in self._lines:
            # These depend only on the line, so compute them once per line.
            bbox = item.geometry.boundingBox
            bbox_left = bbox.left
            bbox_right = bbox.left + bbox.width
            bbox_centre = bbox.left + bbox.width / 2

            for index, column in enumerate(columns):
                # 'right' is an absolute coordinate, so the centre is the
                # midpoint of the two edges (not left + right/2).
                column_centre = (column['left'] + column['right']) / 2
                line_in_column = column['left'] < bbox_centre < column['right']
                column_in_line = bbox_left < column_centre < bbox_right
                if line_in_column or column_in_line:
                    lines.append([index, item.text])
                    break
            else:
                # No existing column matched: this line starts a new one.
                columns.append({'left': bbox_left, 'right': bbox_right})
                lines.append([len(columns) - 1, item.text])

        lines.sort(key=lambda x: x[0])
        return lines

    def getTextInReadingOrder(self):
        """Return the line texts in reading order, each followed by a newline."""
        return "".join(line[1] + '\n' for line in self.getLinesInReadingOrder())

    @property
    def blocks(self):
        """The raw blocks this page was built from."""
        return self._blocks

    @property
    def text(self):
        """Texts of all LINE blocks in block order, each followed by a newline."""
        return self._text

    @property
    def lines(self):
        """List of Line objects, in block order."""
        return self._lines

    @property
    def form(self):
        """The Form holding every field that has a key."""
        return self._form

    @property
    def tables(self):
        """List of Table objects, in block order."""
        return self._tables

    @property
    def content(self):
        """Lines, tables and fields interleaved in block order."""
        return self._content

    @property
    def geometry(self):
        """Geometry of the PAGE block, or None when there was none."""
        return self._geometry

    @property
    def id(self):
        """'Id' of the PAGE block, or None when there was none."""
        return self._id
586
587
class Document:
    """Parsed view over one or more response mappings.

    Args:
        responsePages: A single response mapping or a list of them. Each
            must have a 'Blocks' list; a single mapping is wrapped in a list.
    """

    def __init__(self, responsePages):
        if not isinstance(responsePages, list):
            responsePages = [responsePages]

        self._responsePages = responsePages
        self._pages = []

        self._parse()

    def __str__(self):
        return "\nDocument\n==========\n" + "".join(str(page) + "\n\n" for page in self._pages)

    def _parseDocumentPagesAndBlockMap(self):
        """Split all blocks into per-page lists and index them by id.

        A PAGE block starts a new page; every other block is added to the
        page currently being collected, which carries over across response
        boundaries. Blocks seen before the first PAGE block are collected
        into a leading page of their own rather than raising.

        Returns:
            A ``(documentPages, blockMap)`` tuple: ``documentPages`` is a
            list of ``{"Blocks": [...]}`` mappings, ``blockMap`` maps each
            block id to its block.
        """
        blockMap = {}
        documentPages = []
        documentPage = None
        for page in self._responsePages:
            for block in page['Blocks']:
                if 'BlockType' in block and 'Id' in block:
                    blockMap[block['Id']] = block

                if block['BlockType'] == 'PAGE':
                    # Close the page collected so far before starting anew.
                    if documentPage:
                        documentPages.append({"Blocks": documentPage})
                    documentPage = [block]
                else:
                    if documentPage is None:
                        # No PAGE block seen yet: start collecting anyway
                        # instead of calling append on None.
                        documentPage = []
                    documentPage.append(block)
        if documentPage:
            documentPages.append({"Blocks": documentPage})
        return documentPages, blockMap

    def _parse(self):
        """Build the block map and one Page object per collected page."""
        self._responseDocumentPages, self._blockMap = self._parseDocumentPagesAndBlockMap()
        for documentPage in self._responseDocumentPages:
            self._pages.append(Page(documentPage["Blocks"], self._blockMap))

    @property
    def blocks(self):
        """The list of response mappings passed to the constructor."""
        return self._responsePages

    @property
    def pageBlocks(self):
        """List of ``{"Blocks": [...]}`` mappings, one per collected page."""
        return self._responseDocumentPages

    @property
    def pages(self):
        """List of Page objects, in document order."""
        return self._pages

    def getBlockById(self, blockId):
        """Return the block with id *blockId*, or None when it is unknown."""
        if not self._blockMap:
            return None
        return self._blockMap.get(blockId)