Diff of /trp.py [000000] .. [302778]

Switch to side-by-side view

--- a
+++ b/trp.py
@@ -0,0 +1,652 @@
+import json
+
+class BoundingBox:
+    """Read-only holder for a box's width, height, left and top values."""
+
+    def __init__(self, width, height, left, top):
+        self._left, self._top = left, top
+        self._width, self._height = width, height
+
+    def __str__(self):
+        template = "width: {}, height: {}, left: {}, top: {}"
+        return template.format(self._width, self._height, self._left, self._top)
+
+    @property
+    def width(self):
+        """Width given to the constructor."""
+        return self._width
+
+    @property
+    def height(self):
+        """Height given to the constructor."""
+        return self._height
+
+    @property
+    def left(self):
+        """Left coordinate given to the constructor."""
+        return self._left
+
+    @property
+    def top(self):
+        """Top coordinate given to the constructor."""
+        return self._top
+
+class Polygon:
+    """Read-only holder for one x/y coordinate pair."""
+
+    def __init__(self, x, y):
+        self._x, self._y = x, y
+
+    def __str__(self):
+        template = "x: {}, y: {}"
+        return template.format(self._x, self._y)
+
+    @property
+    def x(self):
+        """X value given to the constructor."""
+        return self._x
+
+    @property
+    def y(self):
+        """Y value given to the constructor."""
+        return self._y
+
+class Geometry:
+    """Bounding box and polygon points built from a geometry dictionary."""
+
+    def __init__(self, geometry):
+        # "BoundingBox" supplies Width/Height/Left/Top; "Polygon" is an
+        # iterable of dictionaries that each carry "X" and "Y".
+        box = geometry["BoundingBox"]
+        points = geometry["Polygon"]
+
+        self._boundingBox = BoundingBox(box["Width"], box["Height"], box["Left"], box["Top"])
+        self._polygon = [Polygon(point["X"], point["Y"]) for point in points]
+
+    def __str__(self):
+        # Only the bounding box is rendered; polygon points are omitted.
+        return "BoundingBox: " + str(self._boundingBox) + "\n"
+
+    @property
+    def boundingBox(self):
+        """The BoundingBox built from the "BoundingBox" entry."""
+        return self._boundingBox
+
+    @property
+    def polygon(self):
+        """List of Polygon points, in the order they appeared in the input."""
+        return self._polygon
+
+class Word:
+    """A single word block: its text, confidence, geometry and id."""
+
+    def __init__(self, block, blockMap):
+        """Read the word's attributes from *block*.
+
+        *blockMap* is accepted but not used by this class.
+
+        Raises KeyError if 'Confidence', 'Geometry' or 'Id' is missing.
+        """
+        self._block = block
+        self._confidence = block['Confidence']
+        self._geometry = Geometry(block['Geometry'])
+        self._id = block['Id']
+        # A missing, None or empty 'Text' entry yields "" instead of a KeyError.
+        self._text = block.get('Text') or ""
+
+    def __str__(self):
+        return self._text
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def text(self):
+        """The word's text, or "" when the block had none."""
+        return self._text
+
+    @property
+    def block(self):
+        """The raw block dictionary this word was built from."""
+        return self._block
+
+class Line:
+    """A line of text together with the WORD blocks it references."""
+
+    def __init__(self, block, blockMap):
+        """Build the line from *block*, resolving child ids through *blockMap*.
+
+        Only relationships of type 'CHILD' are followed, and only children
+        whose "BlockType" is "WORD" become Word objects.
+
+        Raises KeyError if 'Confidence', 'Geometry' or 'Id' is missing, or if
+        a referenced child id is not present in *blockMap*.
+        """
+        self._block = block
+        self._confidence = block['Confidence']
+        self._geometry = Geometry(block['Geometry'])
+        self._id = block['Id']
+
+        # A missing, None or empty 'Text' entry yields "" instead of a KeyError.
+        self._text = block.get('Text') or ""
+
+        self._words = []
+        # A missing or empty 'Relationships' entry means the line has no words.
+        for rs in block.get('Relationships') or []:
+            if rs['Type'] != 'CHILD':
+                continue
+            for cid in rs['Ids']:
+                child = blockMap[cid]
+                if child["BlockType"] == "WORD":
+                    self._words.append(Word(child, blockMap))
+
+    def __str__(self):
+        words = "".join("[{}]".format(str(word)) for word in self._words)
+        return "Line\n==========\n" + self._text + "\n" + "Words\n----------\n" + words
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def words(self):
+        """Word objects for the line's WORD children, in relationship order."""
+        return self._words
+
+    @property
+    def text(self):
+        """The line's text, or "" when the block had none."""
+        return self._text
+
+    @property
+    def block(self):
+        """The raw block dictionary this line was built from."""
+        return self._block
+
+class SelectionElement:
+    """A selection element block and its selection status."""
+
+    def __init__(self, block, blockMap):
+        # blockMap is part of the signature but is not consulted here.
+        self._id = block['Id']
+        self._confidence = block['Confidence']
+        self._selectionStatus = block['SelectionStatus']
+        self._geometry = Geometry(block['Geometry'])
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def selectionStatus(self):
+        """Value of the block's 'SelectionStatus' entry."""
+        return self._selectionStatus
+
+class FieldKey:
+    """Key side of a form field: the key block plus its WORD children."""
+
+    def __init__(self, block, children, blockMap):
+        """Collect the WORD blocks among *children* (ids resolved via *blockMap*).
+
+        The key's text is the words' text joined with single spaces; it is
+        "" when no child is a WORD.
+        """
+        self._block = block
+        self._confidence = block['Confidence']
+        self._geometry = Geometry(block['Geometry'])
+        self._id = block['Id']
+
+        child_blocks = (blockMap[eid] for eid in children)
+        self._content = [
+            Word(wb, blockMap) for wb in child_blocks if wb['BlockType'] == "WORD"
+        ]
+        self._text = ' '.join(word.text for word in self._content)
+
+    def __str__(self):
+        return self._text
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def content(self):
+        """Word objects that make up the key, in child order."""
+        return self._content
+
+    @property
+    def text(self):
+        """Space-joined text of the key's words."""
+        return self._text
+
+    @property
+    def block(self):
+        """The raw block dictionary this key was built from."""
+        return self._block
+
+class FieldValue:
+    """Value side of a form field: words and/or selection elements."""
+
+    def __init__(self, block, children, blockMap):
+        """Collect WORD and SELECTION_ELEMENT blocks among *children*.
+
+        Child ids are resolved through *blockMap*. When at least one WORD is
+        present the text is the words joined with spaces; otherwise it is the
+        status of the last selection element seen, or "" if there is none.
+        """
+        self._block = block
+        self._confidence = block['Confidence']
+        self._geometry = Geometry(block['Geometry'])
+        self._id = block['Id']
+        self._text = ""
+        self._content = []
+
+        word_texts = []
+        for eid in children:
+            child = blockMap[eid]
+            kind = child['BlockType']
+            if kind == "WORD":
+                word = Word(child, blockMap)
+                self._content.append(word)
+                word_texts.append(word.text)
+            elif kind == "SELECTION_ELEMENT":
+                selection = SelectionElement(child, blockMap)
+                self._content.append(selection)
+                self._text = selection.selectionStatus
+
+        # Words take precedence over any selection status recorded above.
+        if word_texts:
+            self._text = ' '.join(word_texts)
+
+    def __str__(self):
+        return self._text
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def content(self):
+        """Word and SelectionElement objects, in child order."""
+        return self._content
+
+    @property
+    def text(self):
+        """Joined word text, or a selection status, or ""."""
+        return self._text
+
+    @property
+    def block(self):
+        """The raw block dictionary this value was built from."""
+        return self._block
+
+class Field:
+    """A form field: an optional FieldKey paired with an optional FieldValue."""
+
+    def __init__(self, block, blockMap):
+        """Build the key and value from *block*'s relationships.
+
+        A 'CHILD' relationship produces the key from *block* itself. A 'VALUE'
+        relationship is followed through *blockMap*; each referenced block
+        whose 'EntityTypes' contains 'VALUE' produces the value from its own
+        'CHILD' relationship. If several candidates exist, the last one wins.
+
+        A block (or value block) with a missing or empty 'Relationships' or
+        'EntityTypes' entry is tolerated and simply contributes nothing, so
+        key and/or value may remain None.
+        """
+        self._key = None
+        self._value = None
+
+        for item in block.get('Relationships') or []:
+            if item["Type"] == "CHILD":
+                self._key = FieldKey(block, item['Ids'], blockMap)
+            elif item["Type"] == "VALUE":
+                for eid in item['Ids']:
+                    vkvs = blockMap[eid]
+                    if 'VALUE' not in (vkvs.get('EntityTypes') or []):
+                        continue
+                    for vitem in vkvs.get('Relationships') or []:
+                        if vitem["Type"] == "CHILD":
+                            self._value = FieldValue(vkvs, vitem['Ids'], blockMap)
+
+    def __str__(self):
+        k = str(self._key) if self._key else ""
+        v = str(self._value) if self._value else ""
+        return "\nField\n==========\n" + "Key: {}\nValue: {}".format(k, v)
+
+    @property
+    def key(self):
+        """The FieldKey, or None when the block had no 'CHILD' relationship."""
+        return self._key
+
+    @property
+    def value(self):
+        """The FieldValue, or None when no value block with children was found."""
+        return self._value
+
+class Form:
+    """Ordered collection of fields with lookup by key text."""
+
+    def __init__(self):
+        self._fields = []
+        self._fieldsMap = {}
+
+    def addField(self, field):
+        """Append *field* and index it by its key text.
+
+        A field whose key is None is still kept in ``fields`` but cannot be
+        indexed; it is skipped for the key map rather than raising
+        AttributeError after the list has already been modified.
+        A later field with the same key text replaces the earlier map entry.
+        """
+        self._fields.append(field)
+        if field.key:
+            self._fieldsMap[field.key.text] = field
+
+    def __str__(self):
+        return "".join(str(field) + "\n" for field in self._fields)
+
+    @property
+    def fields(self):
+        """All added fields, in insertion order."""
+        return self._fields
+
+    def getFieldByKey(self, key):
+        """Return the field whose key text equals *key* exactly, else None."""
+        return self._fieldsMap.get(key)
+
+    def searchFieldsByKey(self, key):
+        """Return fields whose key text contains *key*, ignoring case.
+
+        Fields without a key are skipped.
+        """
+        searchKey = key.lower()
+        return [
+            field for field in self._fields
+            if field.key and searchKey in field.key.text.lower()
+        ]
+
+class Cell:
+    """A table cell: its grid position, spans and child content."""
+
+    def __init__(self, block, blockMap):
+        """Read the cell attributes and resolve its 'CHILD' ids via *blockMap*.
+
+        WORD children contribute their text followed by a space;
+        SELECTION_ELEMENT children contribute their status followed by ', '.
+        Other child block types are ignored.
+        """
+        self._block = block
+        self._id = block['Id']
+        self._confidence = block['Confidence']
+        self._rowIndex = block['RowIndex']
+        self._columnIndex = block['ColumnIndex']
+        self._rowSpan = block['RowSpan']
+        self._columnSpan = block['ColumnSpan']
+        self._geometry = Geometry(block['Geometry'])
+        self._content = []
+
+        pieces = []
+        for rs in block.get('Relationships') or []:
+            if rs['Type'] != 'CHILD':
+                continue
+            for cid in rs['Ids']:
+                child = blockMap[cid]
+                kind = child["BlockType"]
+                if kind == "WORD":
+                    word = Word(child, blockMap)
+                    self._content.append(word)
+                    pieces.append(word.text + ' ')
+                elif kind == "SELECTION_ELEMENT":
+                    selection = SelectionElement(child, blockMap)
+                    self._content.append(selection)
+                    pieces.append(selection.selectionStatus + ', ')
+        # Trailing separators are kept as part of the text.
+        self._text = "".join(pieces)
+
+    def __str__(self):
+        return self._text
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def rowIndex(self):
+        """Value of the block's 'RowIndex' entry."""
+        return self._rowIndex
+
+    @property
+    def columnIndex(self):
+        """Value of the block's 'ColumnIndex' entry."""
+        return self._columnIndex
+
+    @property
+    def rowSpan(self):
+        """Value of the block's 'RowSpan' entry."""
+        return self._rowSpan
+
+    @property
+    def columnSpan(self):
+        """Value of the block's 'ColumnSpan' entry."""
+        return self._columnSpan
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def content(self):
+        """Word and SelectionElement objects, in child order."""
+        return self._content
+
+    @property
+    def text(self):
+        """Concatenated child text, including trailing separators."""
+        return self._text
+
+    @property
+    def block(self):
+        """The raw block dictionary this cell was built from."""
+        return self._block
+
+class Row:
+    """A table row: a mutable list of cells."""
+
+    def __init__(self):
+        self._cells = []
+
+    def __str__(self):
+        # Each cell is rendered inside square brackets, with no separator.
+        return "".join("[{}]".format(str(cell)) for cell in self._cells)
+
+    @property
+    def cells(self):
+        """The row's cell list; callers append to it directly."""
+        return self._cells
+
+class Table:
+    """A table: rows of cells grouped by each cell's row index."""
+
+    def __init__(self, block, blockMap):
+        """Build rows from the cells referenced by *block*'s 'CHILD' ids.
+
+        Cells are consumed in the order their ids are listed; a new row is
+        started whenever a cell's rowIndex exceeds the current row index
+        (which starts at 1). Relationships of other types are ignored.
+        """
+        self._block = block
+
+        self._confidence = block['Confidence']
+        self._geometry = Geometry(block['Geometry'])
+
+        self._id = block['Id']
+        self._rows = []
+
+        ri = 1
+        row = Row()
+        for rs in block.get('Relationships') or []:
+            if rs['Type'] != 'CHILD':
+                continue
+            for cid in rs['Ids']:
+                cell = Cell(blockMap[cid], blockMap)
+                if cell.rowIndex > ri:
+                    self._rows.append(row)
+                    row = Row()
+                    ri = cell.rowIndex
+                row.cells.append(cell)
+        # Flush the last row once, after every relationship has been read.
+        # Flushing inside the relationship loop would append the same Row
+        # object again if a second 'CHILD' relationship were present.
+        if row.cells:
+            self._rows.append(row)
+
+    def __str__(self):
+        body = "".join("Row\n==========\n" + str(row) + "\n" for row in self._rows)
+        return "Table\n==========\n" + body
+
+    @property
+    def confidence(self):
+        """Value of the block's 'Confidence' entry."""
+        return self._confidence
+
+    @property
+    def geometry(self):
+        """Geometry built from the block's 'Geometry' entry."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Value of the block's 'Id' entry."""
+        return self._id
+
+    @property
+    def rows(self):
+        """Row objects in the order they were assembled."""
+        return self._rows
+
+    @property
+    def block(self):
+        """The raw block dictionary this table was built from."""
+        return self._block
+
+class Page:
+    """Lines, form fields and tables parsed from the blocks of one page."""
+
+    def __init__(self, blocks, blockMap):
+        """Parse *blocks*, resolving referenced ids through *blockMap*."""
+        self._blocks = blocks
+        self._text = ""
+        self._lines = []
+        self._form = Form()
+        self._tables = []
+        self._content = []
+        # Only a PAGE block sets these; defaulting to None keeps the
+        # geometry/id properties from raising AttributeError without one.
+        self._geometry = None
+        self._id = None
+
+        self._parse(blockMap)
+
+    def __str__(self):
+        body = "".join(str(item) + "\n" for item in self._content)
+        return "Page\n==========\n" + body
+
+    def _parse(self, blockMap):
+        """Dispatch every block on its "BlockType" and collect the results.
+
+        LINE, TABLE and keyed KEY_VALUE_SET blocks are appended to ``content``
+        in encounter order; other block types are ignored here.
+        """
+        for item in self._blocks:
+            blockType = item["BlockType"]
+            if blockType == "PAGE":
+                self._geometry = Geometry(item['Geometry'])
+                self._id = item['Id']
+            elif blockType == "LINE":
+                line = Line(item, blockMap)
+                self._lines.append(line)
+                self._content.append(line)
+                self._text = self._text + line.text + '\n'
+            elif blockType == "TABLE":
+                table = Table(item, blockMap)
+                self._tables.append(table)
+                self._content.append(table)
+            elif blockType == "KEY_VALUE_SET":
+                # Only the KEY half creates a Field; the Field itself follows
+                # the relationship to its VALUE block.
+                if 'KEY' in item['EntityTypes']:
+                    field = Field(item, blockMap)
+                    if field.key:
+                        self._form.addField(field)
+                        self._content.append(field)
+                    else:
+                        print("WARNING: Detected K/V where key does not have content. Excluding key from output.")
+                        print(field)
+                        print(item)
+
+    def getLinesInReadingOrder(self):
+        """Return ``[column_index, text]`` pairs grouped by detected column.
+
+        Each line is assigned to the first known column whose horizontal span
+        contains the line's centre, or whose centre lies inside the line's
+        span; otherwise the line opens a new column with its own span. The
+        result is stably sorted by column index, so lines keep their original
+        relative order within a column.
+        """
+        columns = []
+        lines = []
+        for item in self._lines:
+            box = item.geometry.boundingBox
+            bbox_left = box.left
+            bbox_right = box.left + box.width
+            bbox_centre = box.left + box.width / 2
+            for index, column in enumerate(columns):
+                # Midpoint of the column span; parentheses are required,
+                # otherwise only 'right' would be halved.
+                column_centre = (column['left'] + column['right']) / 2
+                if (column['left'] < bbox_centre < column['right']) or (bbox_left < column_centre < bbox_right):
+                    lines.append([index, item.text])
+                    break
+            else:
+                # No existing column matched: this line starts a new one.
+                columns.append({'left': bbox_left, 'right': bbox_right})
+                lines.append([len(columns) - 1, item.text])
+
+        lines.sort(key=lambda x: x[0])
+        return lines
+
+    def getTextInReadingOrder(self):
+        """Return the reading-order line texts, each followed by a newline."""
+        return "".join(line[1] + '\n' for line in self.getLinesInReadingOrder())
+
+    @property
+    def blocks(self):
+        """The block list this page was built from."""
+        return self._blocks
+
+    @property
+    def text(self):
+        """Text of all LINE blocks in encounter order, newline-terminated."""
+        return self._text
+
+    @property
+    def lines(self):
+        """Line objects in encounter order."""
+        return self._lines
+
+    @property
+    def form(self):
+        """Form holding the fields whose key has content."""
+        return self._form
+
+    @property
+    def tables(self):
+        """Table objects in encounter order."""
+        return self._tables
+
+    @property
+    def content(self):
+        """Lines, tables and fields interleaved in encounter order."""
+        return self._content
+
+    @property
+    def geometry(self):
+        """Geometry of the PAGE block, or None when there was none."""
+        return self._geometry
+
+    @property
+    def id(self):
+        """Id of the PAGE block, or None when there was none."""
+        return self._id
+
+class Document:
+    """Pages parsed from one response dictionary or a list of them."""
+
+    def __init__(self, responsePages):
+        """Accept a single response or a list of responses and parse it.
+
+        Each response is expected to carry a 'Blocks' list.
+        """
+        if not isinstance(responsePages, list):
+            responsePages = [responsePages]
+
+        self._responsePages = responsePages
+        self._pages = []
+
+        self._parse()
+
+    def __str__(self):
+        body = "".join(str(p) + "\n\n" for p in self._pages)
+        return "\nDocument\n==========\n" + body
+
+    def _parseDocumentPagesAndBlockMap(self):
+        """Group blocks into per-page lists and index them by id.
+
+        Returns ``(documentPages, blockMap)``. ``documentPages`` is a list of
+        ``{"Blocks": [...]}`` dictionaries; a new group starts at every PAGE
+        block and runs across response boundaries until the next PAGE block.
+        ``blockMap`` maps 'Id' to block for every block that has both
+        'BlockType' and 'Id'.
+
+        Blocks seen before the first PAGE block are collected into a leading
+        group of their own instead of raising AttributeError.
+        """
+        blockMap = {}
+
+        documentPages = []
+        documentPage = None
+        for page in self._responsePages:
+            for block in page['Blocks']:
+                if 'BlockType' in block and 'Id' in block:
+                    blockMap[block['Id']] = block
+
+                # .get(): a block without 'BlockType' is treated as non-PAGE.
+                if block.get('BlockType') == 'PAGE':
+                    if documentPage:
+                        documentPages.append({"Blocks": documentPage})
+                    documentPage = [block]
+                else:
+                    if documentPage is None:
+                        documentPage = []
+                    documentPage.append(block)
+        if documentPage:
+            documentPages.append({"Blocks": documentPage})
+        return documentPages, blockMap
+
+    def _parse(self):
+        """Build one Page per block group, sharing a single block map."""
+        self._responseDocumentPages, self._blockMap = self._parseDocumentPagesAndBlockMap()
+        for documentPage in self._responseDocumentPages:
+            self._pages.append(Page(documentPage["Blocks"], self._blockMap))
+
+    @property
+    def blocks(self):
+        """The response list given to the constructor (wrapped if needed)."""
+        return self._responsePages
+
+    @property
+    def pageBlocks(self):
+        """The per-page ``{"Blocks": [...]}`` groups."""
+        return self._responseDocumentPages
+
+    @property
+    def pages(self):
+        """Page objects in document order."""
+        return self._pages
+
+    def getBlockById(self, blockId):
+        """Return the raw block with id *blockId*, or None if unknown."""
+        return self._blockMap.get(blockId) if self._blockMap else None