From 123227aa8c11238ae94c72ba0506f2d476d0ac57 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 05:31:02 +0530 Subject: [PATCH 01/13] Add TextEdge and TextEdges helper classes --- camelot/__version__.py | 11 ++++- camelot/core.py | 93 +++++++++++++++++++++++++++++++++++++++ camelot/parsers/stream.py | 35 +++++++++++++-- 3 files changed, 134 insertions(+), 5 deletions(-) diff --git a/camelot/__version__.py b/camelot/__version__.py index 22adbc44..f19ff5e2 100644 --- a/camelot/__version__.py +++ b/camelot/__version__.py @@ -1,11 +1,18 @@ # -*- coding: utf-8 -*- -VERSION = (0, 3, 2) +VERSION = (0, 4, 0) +PHASE = 'alpha' # alpha, beta or rc +PHASE_VERSION = '1' __title__ = 'camelot-py' __description__ = 'PDF Table Extraction for Humans.' __url__ = 'http://camelot-py.readthedocs.io/' -__version__ = '.'.join(map(str, VERSION)) +if PHASE: + __version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE) + if PHASE_VERSION: + __version__ = '{}.{}'.format(__version__, PHASE_VERSION) +else: + __version__ = '.'.join(map(str, VERSION)) __author__ = 'Vinayak Mehta' __author_email__ = 'vmehta94@gmail.com' __license__ = 'MIT License' diff --git a/camelot/core.py b/camelot/core.py index 45b316bb..66d1c283 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -3,11 +3,104 @@ import os import zipfile import tempfile +from itertools import chain import numpy as np import pandas as pd +class TextEdge(object): + def __init__(self, x, y0, y1, align='left'): + self.x = x + self.y0 = y0 + self.y1 = y1 + self.align = align + self.intersections = 0 + self.is_valid = False + + def __repr__(self): + return ''.format( + round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) + + def update_coords(self, x, y0): + self.x = (self.intersections * self.x + x) / float(self.intersections + 1) + self.y0 = y0 + self.intersections += 1 + # a textedge is valid if it extends uninterrupted over required_elements + if self.intersections > 4: + self.is_valid = 
True + + +class TextEdges(object): + def __init__(self): + self._textedges = {'left': [], 'middle': [], 'right': []} + + @staticmethod + def get_x_coord(textline, align): + x_left = textline.x0 + x_right = textline.x1 + x_middle = x_left + (x_right - x_left) / 2.0 + x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right} + return x_coord[align] + + def add_textedge(self, textline, align): + x = self.get_x_coord(textline, align) + y0 = textline.y0 + y1 = textline.y1 + te = TextEdge(x, y0, y1, align=align) + self._textedges[align].append(te) + + def find_textedge(self, x_coord, align): + for i, te in enumerate(self._textedges[align]): + if np.isclose(te.x, x_coord): + return i + return None + + def update_textedges(self, textline): + for align in ['left', 'middle', 'right']: + x_coord = self.get_x_coord(textline, align) + idx = self.find_textedge(x_coord, align) + if idx is None: + print('adding') + self.add_textedge(textline, align) + else: + print('updating') + self._textedges[align][idx].update_coords(x_coord, textline.y0) + + def generate_textedges(self, textlines): + textlines_flat = list(chain.from_iterable(textlines)) + for tl in textlines_flat: + if len(tl.get_text().strip()) > 1: # TODO: hacky + self.update_textedges(tl) + + # # debug + # import matplotlib.pyplot as plt + + # fig = plt.figure() + # ax = fig.add_subplot(111, aspect='equal') + # for te in self._textedges['left']: + # if te.is_valid: + # ax.plot([te.x, te.x], [te.y0, te.y1]) + # plt.show() + + # fig = plt.figure() + # ax = fig.add_subplot(111, aspect='equal') + # for te in self._textedges['middle']: + # if te.is_valid: + # ax.plot([te.x, te.x], [te.y0, te.y1]) + # plt.show() + + # fig = plt.figure() + # ax = fig.add_subplot(111, aspect='equal') + # for te in self._textedges['right']: + # if te.is_valid: + # ax.plot([te.x, te.x], [te.y0, te.y1]) + # plt.show() + + def generate_tableareas(self): + return {} + + class Cell(object): """Defines a cell in a table with coordinates relative to 
a left-bottom origin. (PDF coordinate space) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 709f01de..55ef7ca8 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -9,7 +9,7 @@ import pandas as pd from .base import BaseParser -from ..core import Table +from ..core import TextEdges, Table from ..utils import (text_in_bbox, get_table_index, compute_accuracy, compute_whitespace) @@ -116,7 +116,7 @@ def _group_rows(text, row_close_tol=2): row_y = t.y0 temp.append(t) rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # hacky + __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -246,6 +246,34 @@ def _validate_columns(self): raise ValueError("Length of table_areas and columns" " should be equal") + def _nurminen_table_detection(self, textlines): + # an general heuristic implementation of the table detection + # algorithm described by Anssi Nurminen's master's thesis: + # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + + # minimum number of textlines to be considered a textedge + REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4 + # padding added to table area's lt and rb + TABLE_AREA_PADDING = 10 + + # TODO: add support for arabic text #141 + # sort textlines in reading order + textlines.sort(key=lambda x: (-x.y0, x.x0)) + # group textlines into rows + text_grouped = self._group_rows( + self.horizontal_text, row_close_tol=self.row_close_tol) + textedges = TextEdges() + # generate left, middle and right textedges + textedges.generate_textedges(text_grouped) + # select relevant edges + # generate table areas using relevant edges and horizontal text + table_bbox = textedges.generate_tableareas() + # treat whole page as table if not table areas found + if not len(table_bbox): + table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + + return table_bbox + def _generate_table_bbox(self): if self.table_areas is not None: table_bbox = {} @@ -257,7 +285,8 @@ def 
_generate_table_bbox(self): y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None else: - table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(self.horizontal_text) self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk): From 378408a271309d7a4034d08d21ffb0b81557adc2 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 05:42:10 +0530 Subject: [PATCH 02/13] Remove debug statements --- camelot/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 66d1c283..f50f77b1 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -61,10 +61,8 @@ def update_textedges(self, textline): x_coord = self.get_x_coord(textline, align) idx = self.find_textedge(x_coord, align) if idx is None: - print('adding') self.add_textedge(textline, align) else: - print('updating') self._textedges[align][idx].update_coords(x_coord, textline.y0) def generate_textedges(self, textlines): From a587ea3782a84b348fc455d3f8f0a3371f1b77e0 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 18:24:31 +0530 Subject: [PATCH 03/13] Add get_relevant textedges method --- camelot/core.py | 77 +++++++++++++++++++++------------------ camelot/parsers/stream.py | 12 ++---- 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index f50f77b1..9a9882d8 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -4,11 +4,20 @@ import zipfile import tempfile from itertools import chain +from operator import itemgetter import numpy as np import pandas as pd +# minimum number of textlines to be considered a textedge +TEXTEDGE_REQUIRED_ELEMENTS = 4 +# y coordinate tolerance for extending text edge +TEXTEDGE_EXTEND_TOLERANCE = 50 +# padding added to table area's lt and rb +TABLE_AREA_PADDING = 10 + + class TextEdge(object): def __init__(self, x, y0, y1, align='left'): self.x = x @@ -23,12 
+32,13 @@ def __repr__(self): round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) def update_coords(self, x, y0): - self.x = (self.intersections * self.x + x) / float(self.intersections + 1) - self.y0 = y0 - self.intersections += 1 - # a textedge is valid if it extends uninterrupted over required_elements - if self.intersections > 4: - self.is_valid = True + if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE): + self.x = (self.intersections * self.x + x) / float(self.intersections + 1) + self.y0 = y0 + self.intersections += 1 + # a textedge is valid if it extends uninterrupted over required_elements + if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: + self.is_valid = True class TextEdges(object): @@ -43,59 +53,56 @@ def get_x_coord(textline, align): x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right} return x_coord[align] - def add_textedge(self, textline, align): + def find(self, x_coord, align): + for i, te in enumerate(self._textedges[align]): + if np.isclose(te.x, x_coord): + return i + return None + + def add(self, textline, align): x = self.get_x_coord(textline, align) y0 = textline.y0 y1 = textline.y1 te = TextEdge(x, y0, y1, align=align) self._textedges[align].append(te) - def find_textedge(self, x_coord, align): - for i, te in enumerate(self._textedges[align]): - if np.isclose(te.x, x_coord): - return i - return None - - def update_textedges(self, textline): - for align in ['left', 'middle', 'right']: + def update(self, textline): + for align in ['left', 'right', 'middle']: x_coord = self.get_x_coord(textline, align) - idx = self.find_textedge(x_coord, align) + idx = self.find(x_coord, align) if idx is None: - self.add_textedge(textline, align) + self.add(textline, align) else: self._textedges[align][idx].update_coords(x_coord, textline.y0) - def generate_textedges(self, textlines): + def generate(self, textlines): textlines_flat = list(chain.from_iterable(textlines)) for tl in textlines_flat: if 
len(tl.get_text().strip()) > 1: # TODO: hacky - self.update_textedges(tl) + self.update(tl) - # # debug - # import matplotlib.pyplot as plt + def get_relevant(self): + intersections_sum = { + 'left': sum(te.intersections for te in self._textedges['left']), + 'right': sum(te.intersections for te in self._textedges['right']), + 'middle': sum(te.intersections for te in self._textedges['middle']) + } - # fig = plt.figure() - # ax = fig.add_subplot(111, aspect='equal') - # for te in self._textedges['left']: - # if te.is_valid: - # ax.plot([te.x, te.x], [te.y0, te.y1]) - # plt.show() + # TODO: naive + relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] + return self._textedges[relevant_align] - # fig = plt.figure() - # ax = fig.add_subplot(111, aspect='equal') - # for te in self._textedges['middle']: - # if te.is_valid: - # ax.plot([te.x, te.x], [te.y0, te.y1]) - # plt.show() + def get_table_areas(self, relevant_textedges): + # # debug + # import matplotlib.pyplot as plt # fig = plt.figure() # ax = fig.add_subplot(111, aspect='equal') - # for te in self._textedges['right']: + # for te in relevant_textedges: # if te.is_valid: # ax.plot([te.x, te.x], [te.y0, te.y1]) # plt.show() - def generate_tableareas(self): return {} diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 55ef7ca8..982b5f6a 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -251,11 +251,6 @@ def _nurminen_table_detection(self, textlines): # algorithm described by Anssi Nurminen's master's thesis: # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 - # minimum number of textlines to be considered a textedge - REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4 - # padding added to table area's lt and rb - TABLE_AREA_PADDING = 10 - # TODO: add support for arabic text #141 # sort textlines in reading order textlines.sort(key=lambda x: (-x.y0, x.x0)) @@ -264,10 +259,11 @@ def _nurminen_table_detection(self, textlines): 
self.horizontal_text, row_close_tol=self.row_close_tol) textedges = TextEdges() # generate left, middle and right textedges - textedges.generate_textedges(text_grouped) + textedges.generate(text_grouped) # select relevant edges - # generate table areas using relevant edges and horizontal text - table_bbox = textedges.generate_tableareas() + relevant_textedges = textedges.get_relevant() + # guess table areas using relevant edges + table_bbox = textedges.get_table_areas(relevant_textedges) # treat whole page as table if not table areas found if not len(table_bbox): table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} From 4e2aee18c33fceafe77379af95c9b486d604d3e2 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 19:48:51 +0530 Subject: [PATCH 04/13] Add get_table_areas textedges method --- camelot/core.py | 76 +++++++++++++++++++++++++++++++-------- camelot/parsers/stream.py | 3 +- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 9a9882d8..c8051dc4 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -83,27 +83,73 @@ def generate(self, textlines): def get_relevant(self): intersections_sum = { - 'left': sum(te.intersections for te in self._textedges['left']), - 'right': sum(te.intersections for te in self._textedges['right']), - 'middle': sum(te.intersections for te in self._textedges['middle']) + 'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid), + 'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid), + 'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid) } # TODO: naive + # get the vertical textedges that intersect maximum number of + # times with horizontal text rows relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] return self._textedges[relevant_align] - def get_table_areas(self, relevant_textedges): - # # debug - # import matplotlib.pyplot as plt - - # fig = plt.figure() - # ax 
= fig.add_subplot(111, aspect='equal') - # for te in relevant_textedges: - # if te.is_valid: - # ax.plot([te.x, te.x], [te.y0, te.y1]) - # plt.show() - - return {} + def get_table_areas(self, textlines, relevant_textedges): + def pad(area): + x0 = area[0] - TABLE_AREA_PADDING + y0 = area[1] - TABLE_AREA_PADDING + x1 = area[2] + TABLE_AREA_PADDING + y1 = area[3] + TABLE_AREA_PADDING + return (x0, y0, x1, y1) + + # sort relevant textedges in reading order + relevant_textedges.sort(key=lambda te: (-te.y0, te.x)) + + table_areas = {} + for te in relevant_textedges: + if te.is_valid: + if not table_areas: + table_areas[(te.x, te.y0, te.x, te.y1)] = None + else: + found = None + for area in table_areas: + # check for overlap + if te.y1 >= area[1] and te.y0 <= area[3]: + found = area + break + if found is None: + table_areas[(te.x, te.y0, te.x, te.y1)] = None + else: + table_areas.pop(found) + updated_area = ( + found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1)) + table_areas[updated_area] = None + + # extend table areas based on textlines that overlap + # vertically. it's possible that these textlines were + # eliminated during textedges generation since numbers and + # sentences/chars are often aligned differently. + # drawback: table areas that have paragraphs to their left + # will include the paragraphs too. 
+ for tl in textlines: + for area in table_areas: + found = None + # check for overlap + if tl.y0 >= area[1] and tl.y1 <= area[3]: + found = area + break + if found is not None: + table_areas.pop(found) + updated_area = ( + min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) + table_areas[updated_area] = None + + # add some padding to table areas + table_areas_padded = {} + for area in table_areas: + table_areas_padded[pad(area)] = None + + return table_areas_padded class Cell(object): diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 982b5f6a..2aa5fc49 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -250,6 +250,7 @@ def _nurminen_table_detection(self, textlines): # an general heuristic implementation of the table detection # algorithm described by Anssi Nurminen's master's thesis: # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + # assumes that tables vertically separated by some distance # TODO: add support for arabic text #141 # sort textlines in reading order @@ -263,7 +264,7 @@ def _nurminen_table_detection(self, textlines): # select relevant edges relevant_textedges = textedges.get_relevant() # guess table areas using relevant edges - table_bbox = textedges.get_table_areas(relevant_textedges) + table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table if not table areas found if not len(table_bbox): table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} From 529914eb6f3bf387dfc64a96dd99197cbd6065d1 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 19:50:59 +0530 Subject: [PATCH 05/13] Update comment --- camelot/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camelot/core.py b/camelot/core.py index c8051dc4..276840c7 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -129,7 +129,7 @@ def pad(area): # vertically. 
it's possible that these textlines were # eliminated during textedges generation since numbers and # sentences/chars are often aligned differently. - # drawback: table areas that have paragraphs to their left + # drawback: table areas that have paragraphs to their sides # will include the paragraphs too. for tl in textlines: for area in table_areas: From bcde67fe179e57a04a58aa824ee0a514885792b2 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 19:56:16 +0530 Subject: [PATCH 06/13] Add constant to include table headers --- camelot/core.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/camelot/core.py b/camelot/core.py index 276840c7..21737742 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -99,7 +99,9 @@ def pad(area): x0 = area[0] - TABLE_AREA_PADDING y0 = area[1] - TABLE_AREA_PADDING x1 = area[2] + TABLE_AREA_PADDING - y1 = area[3] + TABLE_AREA_PADDING + # TODO: deal in percentages instead of absolutes + # add a constant to include table headers + y1 = area[3] + TABLE_AREA_PADDING + 10 return (x0, y0, x1, y1) # sort relevant textedges in reading order @@ -149,6 +151,41 @@ def pad(area): for area in table_areas: table_areas_padded[pad(area)] = None + # debug + import matplotlib.pyplot as plt + import matplotlib.patches as patches + + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in textlines: + xs.extend([t.x0, t.x1]) + ys.extend([t.y0, t.y1]) + ax.add_patch( + patches.Rectangle( + (t.x0, t.y0), + t.x1 - t.x0, + t.y1 - t.y0, + color='blue' + ) + ) + for area in table_areas_padded: + xs.extend([area[0], area[2]]) + ys.extend([area[1], area[3]]) + ax.add_patch( + patches.Rectangle( + (area[0], area[1]), + area[2] - area[0], + area[3] - area[1], + fill=False, + color='red' + ) + ) + + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + plt.show() + return table_areas_padded From 
9b5782f9ba58eb52955ebc949aecfa56b78fed14 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 22 Nov 2018 20:05:30 +0530 Subject: [PATCH 07/13] Fix indent --- camelot/core.py | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 21737742..e9e6e0f3 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -140,52 +140,17 @@ def pad(area): if tl.y0 >= area[1] and tl.y1 <= area[3]: found = area break - if found is not None: - table_areas.pop(found) - updated_area = ( - min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) - table_areas[updated_area] = None + if found is not None: + table_areas.pop(found) + updated_area = ( + min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) + table_areas[updated_area] = None # add some padding to table areas table_areas_padded = {} for area in table_areas: table_areas_padded[pad(area)] = None - # debug - import matplotlib.pyplot as plt - import matplotlib.patches as patches - - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in textlines: - xs.extend([t.x0, t.x1]) - ys.extend([t.y0, t.y1]) - ax.add_patch( - patches.Rectangle( - (t.x0, t.y0), - t.x1 - t.x0, - t.y1 - t.y0, - color='blue' - ) - ) - for area in table_areas_padded: - xs.extend([area[0], area[2]]) - ys.extend([area[1], area[3]]) - ax.add_patch( - patches.Rectangle( - (area[0], area[1]), - area[2] - area[0], - area[3] - area[1], - fill=False, - color='red' - ) - ) - - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() - return table_areas_padded From 9b67b271e48a896ce5822b2421d600abf781cf02 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 02:44:55 +0530 Subject: [PATCH 08/13] Add atol and fix variable declaration --- camelot/core.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff 
--git a/camelot/core.py b/camelot/core.py index e9e6e0f3..44aff2bf 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -55,7 +55,7 @@ def get_x_coord(textline, align): def find(self, x_coord, align): for i, te in enumerate(self._textedges[align]): - if np.isclose(te.x, x_coord): + if np.isclose(te.x, x_coord, atol=0.5): return i return None @@ -134,17 +134,17 @@ def pad(area): # drawback: table areas that have paragraphs to their sides # will include the paragraphs too. for tl in textlines: + found = None for area in table_areas: - found = None # check for overlap if tl.y0 >= area[1] and tl.y1 <= area[3]: found = area break - if found is not None: - table_areas.pop(found) - updated_area = ( - min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) - table_areas[updated_area] = None + if found is not None: + table_areas.pop(found) + updated_area = ( + min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) + table_areas[updated_area] = None # add some padding to table areas table_areas_padded = {} From a1e1fd781d7cdf39707f825a2851ff376e9ff5dd Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 02:51:22 +0530 Subject: [PATCH 09/13] Fix comments --- camelot/core.py | 17 ++++++++++------- camelot/parsers/stream.py | 8 ++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 44aff2bf..e0687d25 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -10,10 +10,12 @@ import pandas as pd -# minimum number of textlines to be considered a textedge +# minimum number of vertical textline intersections for a textedge +# to be considered valid TEXTEDGE_REQUIRED_ELEMENTS = 4 -# y coordinate tolerance for extending text edge +# y coordinate tolerance for extending textedge TEXTEDGE_EXTEND_TOLERANCE = 50 +# TODO: deal in percentages instead of absolutes # padding added to table area's lt and rb TABLE_AREA_PADDING = 10 @@ -36,7 +38,8 @@ def 
update_coords(self, x, y0): self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.y0 = y0 self.intersections += 1 - # a textedge is valid if it extends uninterrupted over required_elements + # a textedge is valid only if it extends uninterrupted + # over a required number of textlines if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: self.is_valid = True @@ -89,8 +92,8 @@ def get_relevant(self): } # TODO: naive - # get the vertical textedges that intersect maximum number of - # times with horizontal text rows + # get vertical textedges that intersect maximum number of + # times with horizontal textlines relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] return self._textedges[relevant_align] @@ -130,8 +133,8 @@ def pad(area): # extend table areas based on textlines that overlap # vertically. it's possible that these textlines were # eliminated during textedges generation since numbers and - # sentences/chars are often aligned differently. - # drawback: table areas that have paragraphs to their sides + # chars/words/sentences are often aligned differently. + # drawback: table areas that have paragraphs on their sides # will include the paragraphs too. 
for tl in textlines: found = None diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2aa5fc49..8f86dbd7 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -247,10 +247,10 @@ def _validate_columns(self): " should be equal") def _nurminen_table_detection(self, textlines): - # an general heuristic implementation of the table detection + # a general heuristic implementation of the table detection # algorithm described by Anssi Nurminen's master's thesis: # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 - # assumes that tables vertically separated by some distance + # assumes that tables are situated relatively apart vertically # TODO: add support for arabic text #141 # sort textlines in reading order @@ -263,9 +263,9 @@ def _nurminen_table_detection(self, textlines): textedges.generate(text_grouped) # select relevant edges relevant_textedges = textedges.get_relevant() - # guess table areas using relevant edges + # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) - # treat whole page as table if not table areas found + # treat whole page as table area if no table areas found if not len(table_bbox): table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} From 0251422e339b568512c236f47e35c03515c06191 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 03:27:23 +0530 Subject: [PATCH 10/13] Add fix to include table headers --- camelot/core.py | 18 +++++++++--------- camelot/parsers/stream.py | 5 +---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index e0687d25..cc0b5a3b 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -15,8 +15,7 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4 # y coordinate tolerance for extending textedge TEXTEDGE_EXTEND_TOLERANCE = 50 -# TODO: deal in percentages instead of absolutes -# padding added to table area's lt and rb +# 
padding added to table area on the left, right and bottom TABLE_AREA_PADDING = 10 @@ -79,8 +78,7 @@ def update(self, textline): self._textedges[align][idx].update_coords(x_coord, textline.y0) def generate(self, textlines): - textlines_flat = list(chain.from_iterable(textlines)) - for tl in textlines_flat: + for tl in textlines: if len(tl.get_text().strip()) > 1: # TODO: hacky self.update(tl) @@ -98,13 +96,12 @@ def get_relevant(self): return self._textedges[relevant_align] def get_table_areas(self, textlines, relevant_textedges): - def pad(area): + def pad(area, average_row_height): x0 = area[0] - TABLE_AREA_PADDING y0 = area[1] - TABLE_AREA_PADDING x1 = area[2] + TABLE_AREA_PADDING - # TODO: deal in percentages instead of absolutes - # add a constant to include table headers - y1 = area[3] + TABLE_AREA_PADDING + 10 + # add a constant since table headers can be relatively up + y1 = area[3] + average_row_height * 5 return (x0, y0, x1, y1) # sort relevant textedges in reading order @@ -136,7 +133,9 @@ def pad(area): # chars/words/sentences are often aligned differently. # drawback: table areas that have paragraphs on their sides # will include the paragraphs too. 
+ sum_textline_height = 0 for tl in textlines: + sum_textline_height += tl.y1 - tl.y0 found = None for area in table_areas: # check for overlap @@ -148,11 +147,12 @@ def pad(area): updated_area = ( min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1)) table_areas[updated_area] = None + average_textline_height = sum_textline_height / float(len(textlines)) # add some padding to table areas table_areas_padded = {} for area in table_areas: - table_areas_padded[pad(area)] = None + table_areas_padded[pad(area, average_textline_height)] = None return table_areas_padded diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 8f86dbd7..79073ac2 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -255,12 +255,9 @@ def _nurminen_table_detection(self, textlines): # TODO: add support for arabic text #141 # sort textlines in reading order textlines.sort(key=lambda x: (-x.y0, x.x0)) - # group textlines into rows - text_grouped = self._group_rows( - self.horizontal_text, row_close_tol=self.row_close_tol) textedges = TextEdges() # generate left, middle and right textedges - textedges.generate(text_grouped) + textedges.generate(textlines) # select relevant edges relevant_textedges = textedges.get_relevant() # guess table areas using textlines and relevant edges From bf894116d2a7d28424fe0799307c36be3b4beb63 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 04:25:04 +0530 Subject: [PATCH 11/13] Update test data --- tests/data.py | 85 ++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/tests/data.py b/tests/data.py index 00e070a9..c75a5887 100755 --- a/tests/data.py +++ b/tests/data.py @@ -33,52 +33,45 @@ ["Nagaland", "2,368,724", "204,329", "226,400", "0", "2,799,453", "783,054", "3,582,507"], ["Odisha", "14,317,179", "2,552,292", "1,107,250", "0", "17,976,721", "451,438", "18,428,159"], ["Puducherry", "4,191,757", "52,249", 
"192,400", "0", "4,436,406", "2,173", "4,438,579"], - ["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"], - ["", "Health Sector Financing by Centre and States/UTs in India [2009-10 to 2012-13](Revised) P a g e |23", "", "", "", "", "", ""] + ["Punjab", "19,775,485", "2,208,343", "2,470,882", "0", "24,454,710", "1,436,522", "25,891,232"] ] data_stream_table_rotated = [ - ["", "", "Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""], - ["", "", "", "Any", "", "", "", "", "", "", "Other", "Any", "", "", "", "Not", "", "Number"], - ["", "", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"], - ["", "Background characteristic", "method", "method", "sterilization", "sterilization", "Pill", "IUD", "Injectables", "Nirodh", "method", "method", "Rhythm", "drawal", "method", "using", "Total", "women"], - ["", "Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "Scheduled caste", "74.8", "55.8", "42.9", "0.9", "9.7", "0.0", "0.2", "2.2", "0.0", "19.0", "11.2", "7.4", "0.4", "25.2", "100.0", "1,363"], - ["", "Scheduled tribe", "59.3", "39.0", "26.8", "0.6", "6.4", "0.6", "1.2", "3.5", "0.0", "20.3", "10.4", "5.8", "4.1", "40.7", "100.0", "256"], - ["", "Other backward class", "71.4", "51.1", "34.9", "0.0", "8.6", "1.4", "0.0", "6.2", "0.0", "20.4", "12.6", "7.8", "0.0", "28.6", "100.0", "211"], - ["", "Other", "71.1", "48.8", "28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"], - ["", "Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "Lowest", "64.5", "48.6", "34.3", "0.5", "10.5", "0.6", "0.7", "2.0", "0.0", "15.9", "9.9", "4.6", 
"1.4", "35.5", "100.0", "1,258"], - ["", "Second", "68.5", "50.4", "36.2", "1.1", "11.4", "0.5", "0.1", "1.1", "0.0", "18.1", "11.2", "6.7", "0.2", "31.5", "100.0", "1,317"], - ["", "Middle", "75.5", "52.8", "33.6", "0.6", "14.2", "0.4", "0.5", "3.4", "0.1", "22.7", "13.4", "8.9", "0.4", "24.5", "100.0", "1,018"], - ["", "Fourth", "73.9", "52.3", "32.0", "0.5", "12.5", "0.6", "0.2", "6.3", "0.2", "21.6", "11.5", "9.9", "0.2", "26.1", "100.0", "908"], - ["", "Highest", "78.3", "44.4", "19.5", "1.0", "9.7", "1.4", "0.0", "12.7", "0.0", "33.8", "18.2", "15.6", "0.0", "21.7", "100.0", "733"], - ["", "Number of living children", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "No children", "25.1", "7.6", "0.3", "0.5", "2.0", "0.0", + ["Table 21 Current use of contraception by background characteristics\u2014Continued", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["", "", "", "", "", "Modern method", "", "", "", "", "", "", "Traditional method", "", "", "", ""], + ["", "", "Any", "", "", "", "", "", "", "Other", "Any", "", "", "", "Not", "", "Number"], + ["", "Any", "modern", "Female", "Male", "", "", "", "Condom/", "modern", "traditional", "", "With-", "Folk", "currently", "", "of"], + ["Background characteristic", "method", "method", "sterilization", "sterilization", "Pill", "IUD", "Injectables", "Nirodh", "method", "method", "Rhythm", "drawal", "method", "using", "Total", "women"], + ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["Scheduled caste", "74.8", "55.8", "42.9", "0.9", "9.7", "0.0", "0.2", "2.2", "0.0", "19.0", "11.2", "7.4", "0.4", "25.2", "100.0", "1,363"], + ["Scheduled tribe", "59.3", "39.0", "26.8", "0.6", "6.4", "0.6", "1.2", "3.5", "0.0", "20.3", "10.4", "5.8", "4.1", "40.7", "100.0", "256"], + ["Other backward class", "71.4", "51.1", "34.9", "0.0", "8.6", "1.4", "0.0", "6.2", "0.0", "20.4", "12.6", "7.8", "0.0", "28.6", "100.0", "211"], + ["Other", "71.1", "48.8", 
"28.2", "0.8", "13.3", "0.9", "0.3", "5.2", "0.1", "22.3", "12.9", "9.1", "0.3", "28.9", "100.0", "3,319"], + ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["Lowest", "64.5", "48.6", "34.3", "0.5", "10.5", "0.6", "0.7", "2.0", "0.0", "15.9", "9.9", "4.6", "1.4", "35.5", "100.0", "1,258"], + ["Second", "68.5", "50.4", "36.2", "1.1", "11.4", "0.5", "0.1", "1.1", "0.0", "18.1", "11.2", "6.7", "0.2", "31.5", "100.0", "1,317"], + ["Middle", "75.5", "52.8", "33.6", "0.6", "14.2", "0.4", "0.5", "3.4", "0.1", "22.7", "13.4", "8.9", "0.4", "24.5", "100.0", "1,018"], + ["Fourth", "73.9", "52.3", "32.0", "0.5", "12.5", "0.6", "0.2", "6.3", "0.2", "21.6", "11.5", "9.9", "0.2", "26.1", "100.0", "908"], + ["Highest", "78.3", "44.4", "19.5", "1.0", "9.7", "1.4", "0.0", "12.7", "0.0", "33.8", "18.2", "15.6", "0.0", "21.7", "100.0", "733"], + ["Number of living children", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], + ["No children", "25.1", "7.6", "0.3", "0.5", "2.0", "0.0", "0.0", "4.8", "0.0", "17.5", "9.0", "8.5", "0.0", "74.9", "100.0", "563"], - ["", "1 child", "66.5", "32.1", "3.7", "0.7", "20.1", "0.7", "0.1", "6.9", "0.0", "34.3", "18.9", "15.2", "0.3", "33.5", "100.0", "1,190"], - ["\x18\x18", "1 son", "66.8", "33.2", "4.1", "0.7", "21.1", "0.5", "0.3", "6.6", "0.0", "33.5", "21.2", "12.3", "0.0", "33.2", "100.0", "672"], - ["", "No sons", "66.1", "30.7", "3.1", "0.6", "18.8", "0.8", "0.0", "7.3", "0.0", "35.4", "15.8", "19.0", "0.6", "33.9", "100.0", "517"], - ["", "2 children", "81.6", "60.5", "41.8", "0.9", "11.6", "0.8", "0.3", "4.8", "0.2", "21.1", "12.2", "8.3", "0.6", "18.4", "100.0", "1,576"], - ["", "1 or more sons", "83.7", "64.2", "46.4", "0.9", "10.8", "0.8", "0.4", "4.8", "0.1", "19.5", "11.1", "7.6", "0.7", "16.3", "100.0", "1,268"], - ["", "No sons", "73.2", "45.5", "23.2", "1.0", "15.1", "0.9", "0.0", "4.8", "0.5", "27.7", "16.8", "11.0", "0.0", "26.8", "100.0", "308"], - ["", "3 children", "83.9", 
"71.2", "57.7", "0.8", "9.8", "0.6", "0.5", "1.8", "0.0", "12.7", "8.7", "3.3", "0.8", "16.1", "100.0", "961"], - ["", "1 or more sons", "85.0", "73.2", "60.3", "0.9", "9.4", "0.5", "0.5", "1.6", "0.0", "11.8", "8.1", "3.0", "0.7", "15.0", "100.0", "860"], - ["", "No sons", "74.7", "53.8", "35.3", "0.0", "13.7", "1.6", "0.0", "3.2", "0.0", "20.9", "13.4", "6.1", "1.5", "25.3", "100.0", "101"], - ["", "4+ children", "74.3", "58.1", "45.1", "0.6", "8.7", "0.6", "0.7", "2.4", "0.0", "16.1", "9.9", "5.4", "0.8", "25.7", "100.0", "944"], - ["", "1 or more sons", "73.9", "58.2", "46.0", "0.7", "8.3", "0.7", "0.7", "1.9", "0.0", "15.7", "9.4", "5.5", "0.8", "26.1", "100.0", "901"], - ["", "No sons", "(82.1)", "(57.3)", "(25.6)", "(0.0)", "(17.8)", "(0.0)", "(0.0)", "(13.9)", "(0.0)", "(24.8)", "(21.3)", "(3.5)", "(0.0)", "(17.9)", "100.0", "43"], - ["", "Total", "71.2", "49.9", "32.2", + ["1 child", "66.5", "32.1", "3.7", "0.7", "20.1", "0.7", "0.1", "6.9", "0.0", "34.3", "18.9", "15.2", "0.3", "33.5", "100.0", "1,190"], + ["1 son", "66.8", "33.2", "4.1", "0.7", "21.1", "0.5", "0.3", "6.6", "0.0", "33.5", "21.2", "12.3", "0.0", "33.2", "100.0", "672"], + ["No sons", "66.1", "30.7", "3.1", "0.6", "18.8", "0.8", "0.0", "7.3", "0.0", "35.4", "15.8", "19.0", "0.6", "33.9", "100.0", "517"], + ["2 children", "81.6", "60.5", "41.8", "0.9", "11.6", "0.8", "0.3", "4.8", "0.2", "21.1", "12.2", "8.3", "0.6", "18.4", "100.0", "1,576"], + ["1 or more sons", "83.7", "64.2", "46.4", "0.9", "10.8", "0.8", "0.4", "4.8", "0.1", "19.5", "11.1", "7.6", "0.7", "16.3", "100.0", "1,268"], + ["No sons", "73.2", "45.5", "23.2", "1.0", "15.1", "0.9", "0.0", "4.8", "0.5", "27.7", "16.8", "11.0", "0.0", "26.8", "100.0", "308"], + ["3 children", "83.9", "71.2", "57.7", "0.8", "9.8", "0.6", "0.5", "1.8", "0.0", "12.7", "8.7", "3.3", "0.8", "16.1", "100.0", "961"], + ["1 or more sons", "85.0", "73.2", "60.3", "0.9", "9.4", "0.5", "0.5", "1.6", "0.0", "11.8", "8.1", "3.0", "0.7", "15.0", "100.0", 
"860"], + ["No sons", "74.7", "53.8", "35.3", "0.0", "13.7", "1.6", "0.0", "3.2", "0.0", "20.9", "13.4", "6.1", "1.5", "25.3", "100.0", "101"], + ["4+ children", "74.3", "58.1", "45.1", "0.6", "8.7", "0.6", "0.7", "2.4", "0.0", "16.1", "9.9", "5.4", "0.8", "25.7", "100.0", "944"], + ["1 or more sons", "73.9", "58.2", "46.0", "0.7", "8.3", "0.7", "0.7", "1.9", "0.0", "15.7", "9.4", "5.5", "0.8", "26.1", "100.0", "901"], + ["No sons", "(82.1)", "(57.3)", "(25.6)", "(0.0)", "(17.8)", "(0.0)", "(0.0)", "(13.9)", "(0.0)", "(24.8)", "(21.3)", "(3.5)", "(0.0)", "(17.9)", "100.0", "43"], + ["Total", "71.2", "49.9", "32.2", "0.7", "11.7", "0.6", "0.3", "4.3", "0.1", "21.3", "12.3", "8.4", "0.5", "28.8", "100.0", "5,234"], - ["", "NFHS-2 (1998-99)", "66.6", "47.3", "32.0", "1.8", "9.2", "1.4", "na", "2.9", "na", "na", "8.7", "9.8", "na", "33.4", "100.0", "4,116"], - ["", "NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"], - ["", "", "Note: If more than one method is used, only the most effective method is considered in this tabulation. 
Total includes women for whom caste/tribe was not known or is missing, who are", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "not shown separately.", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "na = Not available", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "", "ns = Not shown; see table 2b, footnote 1", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "( ) Based on 25-49 unweighted cases.", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], - ["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""] + ["NFHS-2 (1998-99)", "66.6", "47.3", "32.0", "1.8", "9.2", "1.4", "na", "2.9", "na", "na", "8.7", "9.8", "na", "33.4", "100.0", "4,116"], + ["NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"] ] data_stream_table_areas = [ @@ -187,14 +180,10 @@ ["", "", "", "", "1522 WEST LINDSEY", "", "", "", "", ""], ["632575", "BAW", "BASHU LEGENDS", "HYH HE CHUANG LLC", "STREET", "NORMAN", "OK", "73069", "-", "2014/07/21"], ["", "", "", "DEEP FORK HOLDINGS", "", "", "", "", "", ""], - ["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"], - ["", "", "", "", "Page 1 of 151", "", "", "", "", ""] + ["543149", "BAW", "BEDLAM BAR-B-Q", "LLC", "610 NORTHEAST 50TH", "OKLAHOMA CITY", "OK", "73105", "(405) 528-7427", "2015/02/23"] ] data_stream_flag_size = [ - ["", "TABLE 125: STATE-WISE COMPOSITION OF OUTSTANDING LIABILITIES - 1997 (Contd.)", "", "", "", "", "", "", "", "", ""], - ["", "", "", "", "(As at end-March)", "", "", "", "", "", ""], - ["", "", "", "", "", "", "", "", "", "", "(` Billion)"], ["States", "Total", "Market", "NSSF", "WMA", "Loans", "Loans", "Loans", "Loans", "Loans", "Loans"], ["", "Internal", "Loans", "", "from", "from", "from", "from", "from", "from SBI", "from"], ["", "Debt", 
"", "", "RBI", "Banks", "LIC", "GIC", "NABARD", "& Other", "NCDC"], @@ -230,9 +219,7 @@ ["Uttar Pradesh", "80.62", "74.89", "-", "4.34", "1.34", "0.6", "-", "-0.21", "0.18", "0.03"], ["West Bengal", "34.23", "32.19", "-", "-", "2.04", "0.77", "-", "0.06", "-", "0.51"], ["NCT Delhi", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-"], - ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"], - ["2 Includes `2.45 crore outstanding under “Market Loan Suspense”.", "", "", "", "", "", "", "", "", "", ""], - ["", "", "", "", "445", "", "", "", "", "", ""] + ["ALL STATES", "513.38", "436.02", "-", "25.57", "51.06", "14.18", "-", "8.21", "11.83", "11.08"] ] data_lattice = [ From 1f71513004e61a838035a39b31d223e9df478992 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 19:28:55 +0530 Subject: [PATCH 12/13] Fix no table found warning and add tests for two tables --- Makefile | 2 +- camelot/parsers/stream.py | 15 ++++- setup.cfg | 2 +- tests/data.py | 125 ++++++++++++++++++++++++++++++++++++++ tests/test_common.py | 22 +++++++ 5 files changed, 162 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index d0b54b0e..383c8018 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ install: pip install ".[dev]" test: - pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests + pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl docs: cd docs && make html diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 79073ac2..178e0052 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -309,10 +309,21 @@ def _generate_columns_and_rows(self, table_idx, tk): cols.append(text_x_max) cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] else: + # calculate mode of the list of number of elements in + # each row to guess the number of columns ncols = max(set(elements), 
key=elements.count) if ncols == 1: - warnings.warn("No tables found on {}".format( - os.path.basename(self.rootname))) + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if len(elements): + ncols = max(set(elements), key=elements.count) + else: + warnings.warn("No tables found in table area {}".format( + table_idx + 1)) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) inner_text = [] diff --git a/setup.cfg b/setup.cfg index 1a59858c..2c56c090 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ test=pytest [tool:pytest] -addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests +addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl python_files = tests/test_*.py diff --git a/tests/data.py b/tests/data.py index c75a5887..4cc6f89a 100755 --- a/tests/data.py +++ b/tests/data.py @@ -74,6 +74,99 @@ ["NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"] ] +data_stream_two_tables_1 = [ + ["[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)", "", "", "", "", "", "", "", "", ""], + ["Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", "", "", "", "", "", "", "", "", ""], + ["by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", "", "", "", "", "", "", "", "", ""], + ["could represent multiple arrests of the same person. 
See text, this section and source]", "", "", "", "", "", "", "", "", ""], + ["", "", "Total", "", "", "Male", "", "", "Female", ""], + ["Offense charged", "", "Under 18", "18 years", "", "Under 18", "18 years", "", "Under 18", "18 years"], + ["", "Total", "years", "and over", "Total", "years", "and over", "Total", "years", "and over"], + ["Total . . . . . . . . . . . . . . . . . . . . . . . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"], + ["Violent crime . . . . . . . . . . . . . . . . . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"], + ["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""], + ["manslaughter . . . . . . . .. .. .. .. ..", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "–", "1.0"], + ["Forcible rape . . . . . . . .. .. .. .. .. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "–", "–", "–"], + ["Robbery . . . .. .. . .. . ... . ... . ...", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"], + ["Aggravated assault . . . . . . . .. .. ..", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"], + ["Property crime . . . . . . . . . . . . . . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"], + ["Burglary . .. . . . . .. ... .... .... ..", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"], + ["Larceny-theft . . . . . . . .. .. .. .. .. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"], + ["Motor vehicle theft . . . . .. .. . .... .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"], + ["Arson .. . . . .. . ... .... .... .... .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"], + ["Other assaults .. . . . . .. . ... . ... ..", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"], + ["Forgery and counterfeiting .. . . . 
. . ..", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"], + ["Fraud .... .. . . .. ... .... .... ....", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"], + ["Embezzlement . . .. . . . .. . ... . ....", "14.6", "–", "14.1", "7.2", "–", "6.9", "7.4", "–", "7.2"], + ["Stolen property 1 . . . . . . .. . .. .. ...", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"], + ["Vandalism . . . . . . . .. .. .. .. .. ....", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"], + ["Weapons; carrying, possessing, etc. .", "132.9", "27.1", "105.8", "122.1", "24.3", "97.8", "10.8", "2.8", "8.0"], + ["Prostitution and commercialized vice", + "56.9", "1.1", "55.8", "17.3", "–", "17.1", "39.6", "0.8", "38.7"], + ["Sex offenses 2 . . . . .. . . . .. .. .. . ..", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"], + ["Drug abuse violations . . . . . . . .. ...", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"], + ["Gambling .. . . . . .. ... . ... . ... ...", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "–", "0.9"], + ["Offenses against the family and", "", "", "", "", "", "", "", "", ""], + ["children . . . .. . . .. .. .. .. .. .. . ..", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"], + ["Driving under the influence . . . . . .. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"], + ["Liquor laws . . . . . . . .. .. .. .. .. .. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4", + "131.4", "34.7", "96.6"], + ["Drunkenness . . .. . . . .. . ... . ... ..", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"], + ["Disorderly conduct . .. . . . . . .. .. .. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"], + ["Vagrancy . . . .. . . . ... .... .... 
...", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"], + ["All other offenses (except traffic) . . ..", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"], + ["Suspicion . . . .. . . .. .. .. .. .. .. . ..", "1.6", "–", "1.4", "1.2", "–", "1.0", "–", "–", "–"], + ["Curfew and loitering law violations ..", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"], + ["Runaways . . . . . . . .. .. .. .. .. ....", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"], + ["", "– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", "", "", "", "", ""], + ["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", "", "", "", "", ""] +] + +data_stream_two_tables_2 = [ + ["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", ""], + ["Table 325. Arrests by Race: 2009", "", "", "", "", ""], + ["[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", ""], + ["with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", ""], + ["", "", "", "", "American", ""], + ["Offense charged", "", "", "", + "Indian/Alaskan", "Asian Pacific"], + ["", "Total", "White", "Black", "Native", "Islander"], + ["Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"], + ["Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "456,965", "268,346", "177,766", "5,608", "5,245"], + ["Murder and nonnegligent manslaughter . .. ... .", "9,739", "4,741", "4,801", "100", "97"], + ["Forcible rape . . . . . . . .. .. .. .. .... .. ...... 
.", "16,362", "10,644", "5,319", "169", "230"], + ["Robbery . . . . .. . . . ... . ... . .... .... .... . . .", "100,496", "43,039", "55,742", "726", "989"], + ["Aggravated assault . . . . . . . .. .. ...... .. ....", "330,368", "209,922", "111,904", "4,613", "3,929"], + ["Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .", "1,364,409", "922,139", "406,382", "17,599", "18,289"], + ["Burglary . . .. . . . .. . .... .... .... .... ... . . .", "234,551", "155,994", "74,419", "2,021", "2,117"], + ["Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .", "1,056,473", "719,983", "306,625", "14,646", "15,219"], + ["Motor vehicle theft . . . . . .. ... . ... ..... ... ..", "63,919", "39,077", "23,184", "817", "841"], + ["Arson .. . . .. .. .. ... .... .... .... .... . . . . .", "9,466", "7,085", "2,154", "115", "112"], + ["Other assaults .. . . . . . ... . ... . ... ..... ... ..", "1,032,502", "672,865", "332,435", "15,127", "12,075"], + ["Forgery and counterfeiting .. . . . . . ... ..... .. ..", "67,054", "44,730", "21,251", "345", "728"], + ["Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"], + ["Embezzlement . . . .. . . . ... . ... . .... ... .....", "13,960", "9,208", "4,429", "75", "248"], + ["Stolen property; buying, receiving, possessing .. .", "82,714", "51,953", "29,357", "662", "742"], + ["Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .", "212,173", "157,723", "48,746", "3,352", "2,352"], + ["Weapons—carrying, possessing, etc. .. .. ... .. .", "130,503", "74,942", "53,441", "951", "1,169"], + ["Prostitution and commercialized vice . ... .. .. ..", "56,560", "31,699", "23,021", "427", "1,413"], + ["Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .", "60,175", "44,240", "14,347", "715", "873"], + ["Drug abuse violations . . . . . . . .. . ..... .. .....", "1,301,629", "845,974", "437,623", "8,588", "9,444"], + ["Gambling . . . . .. . . . ... . ... . .. ... . 
...... .. .", "8,046", "2,290", "5,518", "27", "211"], + ["Offenses against the family and children ... .. .. .", "87,232", "58,068", "26,850", "1,690", "624"], + ["Driving under the influence . . . . . . .. ... ...... .", "1,105,401", "954,444", "121,594", "14,903", "14,460"], + ["Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....", "444,087", "373,189", "50,431", "14,876", "5,591"], + ["Drunkenness . .. . . . . . ... . ... . ..... . .......", "469,958", "387,542", "71,020", "8,552", "2,844"], + ["Disorderly conduct . . .. . . . . .. .. . ..... .. .....", "515,689", "326,563", "176,169", "8,783", "4,174"], + ["Vagrancy . . .. .. . . .. ... .... .... .... .... . . .", "26,347", "14,581", "11,031", "543", "192"], + ["All other offenses (except traffic) . .. .. .. ..... ..", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"], + ["Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .", "1,513", "677", "828", "1", "7"], + ["Curfew and loitering law violations . .. ... .. ....", "89,578", "54,439", "33,207", "872", "1,060"], + ["Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .", "73,616", "48,343", "19,670", "1,653", "3,950"], + ["1 Except forcible rape and prostitution.", "", "", "", "", ""], + ["", "Source: U.S. 
Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", "", "", "", ""] +] + data_stream_table_areas = [ ["", "One Withholding"], ["Payroll Period", "Allowance"], @@ -248,6 +341,38 @@ ["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"] ] +data_lattice_two_tables_1 = [ + ["State", "n", "Literacy Status", "", "", "", "", ""], + ["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"], + ["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5"], + ["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2"], + ["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8"], + ["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9"], + ["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0"], + ["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8"], + ["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6"], + ["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5"], + ["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4"], + ["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6"], + ["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4"] +] + +data_lattice_two_tables_2 = [ + ["State", "n", "Literacy Status", "", "", "", "", ""], + ["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"], + ["Kerala", "2400", "8.8", "0.3", "20.1", "17.0", "45.6", "8.2"], + ["Tamil Nadu", "2400", "29.9", "1.5", "8.5", "33.1", "22.3", "4.8"], + ["Karnataka", "2399", "47.9", "2.5", "10.2", "18.8", "18.4", "2.3"], + ["Andhra Pradesh", "2400", "66.4", "0.7", "6.8", "12.9", "11.4", "1.8"], + ["Maharashtra", "2400", "41.3", "0.6", "14.1", "20.1", "21.6", "2.2"], + ["Gujarat", "2390", "57.6", "0.1", "10.3", "16.5", "12.9", "2.7"], + ["Madhya Pradesh", "2402", "58.7", "2.2", "6.6", "24.1", "5.3", "3.0"], + ["Orissa", 
"2405", "50.0", "0.9", "8.1", "21.9", "15.1", "4.0"], + ["West Bengal", "2293", "49.1", "4.8", "11.2", "16.8", "17.1", "1.1"], + ["Uttar Pradesh", "2400", "67.3", "2.0", "3.1", "17.2", "7.7", "2.7"], + ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"] +] + data_lattice_table_areas = [ ["", "", "", "", "", "", "", "", ""], ["State", "n", "Literacy Status", "", "", "", "", "", ""], diff --git a/tests/test_common.py b/tests/test_common.py index bfd1ea64..708d61cd 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -56,6 +56,17 @@ def test_stream_table_rotated(): assert df.equals(tables[0].df) +def test_stream_two_tables(): + df1 = pd.DataFrame(data_stream_two_tables_1) + df2 = pd.DataFrame(data_stream_two_tables_2) + + filename = os.path.join(testdir, "tabula/12s0324.pdf") + tables = camelot.read_pdf(filename, flavor='stream') + assert len(tables) == 2 + assert df1.equals(tables[0].df) + assert df2.equals(tables[1].df) + + def test_stream_table_areas(): df = pd.DataFrame(data_stream_table_areas) @@ -111,6 +122,17 @@ def test_lattice_table_rotated(): assert df.equals(tables[0].df) +def test_lattice_two_tables(): + df1 = pd.DataFrame(data_lattice_two_tables_1) + df2 = pd.DataFrame(data_lattice_two_tables_2) + + filename = os.path.join(testdir, "twotables_2.pdf") + tables = camelot.read_pdf(filename) + assert len(tables) == 2 + assert df1.equals(tables[0].df) + assert df2.equals(tables[1].df) + + def test_lattice_table_areas(): df = pd.DataFrame(data_lattice_table_areas) From 23ec6b55f70d1a8ce99630cd984565e29eaf8092 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 23 Nov 2018 21:04:10 +0530 Subject: [PATCH 13/13] Add docstrings and update docs --- README.md | 4 +- camelot/__version__.py | 21 ++++++---- camelot/core.py | 52 ++++++++++++++++++++++++- camelot/parsers/stream.py | 11 ++++-- docs/dev/contributing.rst | 2 +- docs/index.rst | 3 +- docs/user/how-it-works.rst | 16 ++++---- docs/user/install-deps.rst | 76 
++++++++++++++++++++++++++++++++++++ docs/user/install.rst | 79 +++----------------------------------- 9 files changed, 165 insertions(+), 99 deletions(-) create mode 100755 docs/user/install-deps.rst diff --git a/README.md b/README.md index 93b72159..1d89c30d 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ $ conda install -c conda-forge camelot-py ### Using pip -After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot: +After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
 $ pip install camelot-py[cv]
@@ -128,4 +128,4 @@ Camelot uses [Semantic Versioning](https://semver.org/). For the available versi
 
 ## License
 
-This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
\ No newline at end of file
+This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
diff --git a/camelot/__version__.py b/camelot/__version__.py
index f19ff5e2..a48c9db0 100644
--- a/camelot/__version__.py
+++ b/camelot/__version__.py
@@ -1,18 +1,23 @@
 # -*- coding: utf-8 -*-
 
 VERSION = (0, 4, 0)
-PHASE = 'alpha' # alpha, beta or rc
-PHASE_VERSION = '1'
+PRERELEASE = None # alpha, beta or rc
+REVISION = None
+
+
+def generate_version(version, prerelease=None, revision=None):
+    version_parts = ['.'.join(map(str, version))]
+    if prerelease is not None:
+        version_parts.append('-{}'.format(prerelease))
+    if revision is not None:
+        version_parts.append('.{}'.format(revision))
+    return ''.join(version_parts)
+
 
 __title__ = 'camelot-py'
 __description__ = 'PDF Table Extraction for Humans.'
 __url__ = 'http://camelot-py.readthedocs.io/'
-if PHASE:
-    __version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE)
-    if PHASE_VERSION:
-        __version__ = '{}.{}'.format(__version__, PHASE_VERSION)
-else:
-    __version__ = '.'.join(map(str, VERSION))
+__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
 __author__ = 'Vinayak Mehta'
 __author_email__ = 'vmehta94@gmail.com'
 __license__ = 'MIT License'
diff --git a/camelot/core.py b/camelot/core.py
index cc0b5a3b..f11fcc11 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -20,6 +20,29 @@
 
 
 class TextEdge(object):
+    """Defines a text edge with coordinates relative to a left-bottom
+    origin. (PDF coordinate space)
+
+    Parameters
+    ----------
+    x : float
+        x-coordinate of the text edge.
+    y0 : float
+        y-coordinate of bottommost point.
+    y1 : float
+        y-coordinate of topmost point.
+    align : string, optional (default: 'left')
+        {'left', 'right', 'middle'}
+
+    Attributes
+    ----------
+    intersections : int
+        Number of intersections with horizontal text rows.
+    is_valid : bool
+        A text edge is valid if it intersects at least
+        TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
+
+    """
     def __init__(self, x, y0, y1, align='left'):
         self.x = x
         self.y0 = y0
@@ -33,6 +56,9 @@ def __repr__(self):
             round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
 
     def update_coords(self, x, y0):
+        """Updates the text edge's x and bottom y coordinates and sets
+        the is_valid attribute.
+        """
         if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
             self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
             self.y0 = y0
@@ -44,11 +70,18 @@ def update_coords(self, x, y0):
 
 
 class TextEdges(object):
+    """Defines a dict of left, right and middle text edges found on
+    the PDF page. The dict has three keys based on the alignments,
+    and each key's value is a list of camelot.core.TextEdge objects.
+    """
     def __init__(self):
-        self._textedges = {'left': [], 'middle': [], 'right': []}
+        self._textedges = {'left': [], 'right': [], 'middle': []}
 
     @staticmethod
     def get_x_coord(textline, align):
+        """Returns the x coordinate of a text row based on the
+        specified alignment.
+        """
         x_left = textline.x0
         x_right = textline.x1
         x_middle = x_left + (x_right - x_left) / 2.0
@@ -56,12 +89,17 @@ def get_x_coord(textline, align):
         return x_coord[align]
 
     def find(self, x_coord, align):
+        """Returns the index of an existing text edge using
+        the specified x coordinate and alignment.
+        """
         for i, te in enumerate(self._textedges[align]):
             if np.isclose(te.x, x_coord, atol=0.5):
                 return i
         return None
 
     def add(self, textline, align):
+        """Adds a new text edge to the current dict.
+        """
         x = self.get_x_coord(textline, align)
         y0 = textline.y0
         y1 = textline.y1
@@ -69,6 +107,8 @@ def add(self, textline, align):
         self._textedges[align].append(te)
 
     def update(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
         for align in ['left', 'right', 'middle']:
             x_coord = self.get_x_coord(textline, align)
             idx = self.find(x_coord, align)
@@ -78,11 +118,18 @@ def update(self, textline):
                 self._textedges[align][idx].update_coords(x_coord, textline.y0)
 
     def generate(self, textlines):
+        """Generates the text edges dict based on horizontal text
+        rows.
+        """
         for tl in textlines:
             if len(tl.get_text().strip()) > 1: # TODO: hacky
                 self.update(tl)
 
     def get_relevant(self):
+        """Returns the list of relevant text edges (all share the same
+        alignment) based on which list intersects horizontal text rows
+        the most.
+        """
         intersections_sum = {
             'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
             'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
@@ -96,6 +143,9 @@ def get_relevant(self):
         return self._textedges[relevant_align]
 
     def get_table_areas(self, textlines, relevant_textedges):
+        """Returns a dict of interesting table areas on the PDF page
+        calculated using relevant text edges.
+        """
         def pad(area, average_row_height):
             x0 = area[0] - TABLE_AREA_PADDING
             y0 = area[1] - TABLE_AREA_PADDING
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 178e0052..3b9c0683 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -247,10 +247,13 @@ def _validate_columns(self):
                                  " should be equal")
 
     def _nurminen_table_detection(self, textlines):
-        # a general heuristic implementation of the table detection
-        # algorithm described by Anssi Nurminen's master's thesis:
-        # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
-        # assumes that tables are situated relatively apart vertically
+        """A general implementation of the table detection algorithm
+        described in Anssi Nurminen's master's thesis.
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+
+        Assumes that tables are situated relatively far apart
+        vertically.
+        """
 
         # TODO: add support for arabic text #141
         # sort textlines in reading order
diff --git a/docs/dev/contributing.rst b/docs/dev/contributing.rst
index 1ecaee0b..21cdb363 100644
--- a/docs/dev/contributing.rst
+++ b/docs/dev/contributing.rst
@@ -7,7 +7,7 @@ If you're reading this, you're probably looking to contributing to Camelot. *Tim
 
 This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.
 
-.. _Vinayak Mehta: https://vinayak-mehta.github.io
+.. _Vinayak Mehta: https://www.vinayakmehta.com
 
 Code Of Conduct
 ---------------
diff --git a/docs/index.rst b/docs/index.rst
index 2d695105..4c2bf079 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -92,6 +92,7 @@ This part of the documentation begins with some background information about why
    :maxdepth: 2
 
    user/intro
+   user/install-deps
    user/install
    user/how-it-works
    user/quickstart
@@ -118,4 +119,4 @@ you.
 .. toctree::
    :maxdepth: 2
 
-   dev/contributing
\ No newline at end of file
+   dev/contributing
diff --git a/docs/user/how-it-works.rst b/docs/user/how-it-works.rst
index 0783c60c..13004daa 100644
--- a/docs/user/how-it-works.rst
+++ b/docs/user/how-it-works.rst
@@ -5,24 +5,24 @@ How It Works
 
 This part of the documentation includes a high-level explanation of how Camelot extracts tables from PDF files.
 
-You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired from `Tabula`_.
-
-.. _Tabula: https://github.com/tabulapdf/tabula
+You can choose between two table parsing methods, *Stream* and *Lattice*. These names for parsing methods inside Camelot were inspired by `Tabula <https://github.com/tabulapdf/tabula>`_.
 
 .. _stream:
 
 Stream
 ------
 
-Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It looks for these spaces between text to form a table representation.
+Stream can be used to parse tables that have whitespaces between cells to simulate a table structure. It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins <https://euske.github.io/pdfminer/#tools>`_.
+
+1. Words on the PDF page are grouped into text rows based on their *y* axis overlaps.
 
-It is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences, using `margins`_. After getting the words on a page, it groups them into rows based on their *y* coordinates. It then tries to guess the number of columns the table might have by calculating the mode of the number of words in each row. This mode is used to calculate *x* ranges for the table's columns. It then adds columns to this column range list based on any words that may lie outside or inside the current column *x* ranges.
+2. Textedges are calculated and then used to guess interesting table areas on the PDF page. You can read `Anssi Nurminen's master's thesis <https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3>`_ to know more about this table detection technique. [See pages 20, 35 and 40]
 
-.. _margins: https://euske.github.io/pdfminer/#tools
+3. The number of columns inside each table area is then guessed. This is done by calculating the mode of the number of words in each text row. Based on this mode, words in each text row are chosen to calculate a list of column *x* ranges.
 
-.. note:: By default, Stream treats the whole PDF page as a table, which isn't ideal when there are more than two tables on a page with different number of columns. Automatic table detection for Stream is `in the works`_.
+4. Words that lie inside/outside the current column *x* ranges are then used to extend the current list of columns.
 
-.. _in the works: https://github.com/socialcopsdev/camelot/issues/102
+5. Finally, a table is formed using the text rows' *y* ranges and column *x* ranges and words found on the page are assigned to the table's cells based on their *x* and *y* coordinates.
 
 .. _lattice:
 
diff --git a/docs/user/install-deps.rst b/docs/user/install-deps.rst
new file mode 100755
index 00000000..287af3a6
--- /dev/null
+++ b/docs/user/install-deps.rst
@@ -0,0 +1,76 @@
+.. _install_deps:
+
+Installation of dependencies
+============================
+
+The dependencies `Tkinter`_ and `ghostscript`_ can be installed using your system's package manager. You can run one of the following, based on your OS.
+
+.. _Tkinter: https://wiki.python.org/moin/TkInter
+.. _ghostscript: https://www.ghostscript.com
+
+OS-specific instructions
+------------------------
+
+For Ubuntu
+^^^^^^^^^^
+::
+
+    $ apt install python-tk ghostscript
+
+Or for Python 3::
+
+    $ apt install python3-tk ghostscript
+
+For macOS
+^^^^^^^^^
+::
+
+    $ brew install tcl-tk ghostscript
+
+For Windows
+^^^^^^^^^^^
+
+For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
+
+After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the Windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
+
+.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
+.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
+.. _as shown here: https://java.com/en/download/help/path.xml
+
+Checks to see if dependencies were installed correctly
+------------------------------------------------------
+
+You can do the following checks to see if the dependencies were installed correctly.
+
+For Tkinter
+^^^^^^^^^^^
+
+Launch Python, and then at the prompt, type::
+
+    >>> import Tkinter
+
+Or in Python 3::
+
+    >>> import tkinter
+
+If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
+
+For ghostscript
+^^^^^^^^^^^^^^^
+
+Run the following to check the ghostscript version.
+
+For Ubuntu/macOS::
+
+    $ gs -version
+
+For Windows::
+
+    C:\> gswin64c.exe -version
+
+Or for Windows 32-bit::
+
+    C:\> gswin32c.exe -version
+
+If you have ghostscript, you should see the ghostscript version and copyright information.
diff --git a/docs/user/install.rst b/docs/user/install.rst
index e28e5469..fc9fc827 100644
--- a/docs/user/install.rst
+++ b/docs/user/install.rst
@@ -3,7 +3,7 @@
 Installation of Camelot
 =======================
 
-This part of the documentation covers how to install Camelot.
+This part of the documentation covers the steps to install Camelot.
 
 Using conda
 -----------
@@ -23,84 +23,17 @@ The easiest way to install Camelot is to install it with `conda`_, which is a pa
 Using pip
 ---------
 
-First, you'll need to install the dependencies, which include `Tkinter`_ and `ghostscript`_.
+After :ref:`installing the dependencies <install_deps>`, which include `Tkinter`_ and `ghostscript`_, you can simply use pip to install Camelot::
+
+    $ pip install camelot-py[cv]
 
 .. _Tkinter: https://wiki.python.org/moin/TkInter
 .. _ghostscript: https://www.ghostscript.com
 
-These can be installed using your system's package manager. You can run one of the following, based on your OS.
-
-For Ubuntu
-^^^^^^^^^^
-::
-
-    $ apt install python-tk ghostscript
-
-Or for Python 3::
-
-    $ apt install python3-tk ghostscript
-
-For macOS
-^^^^^^^^^
-::
-
-    $ brew install tcl-tk ghostscript
-
-For Windows
-^^^^^^^^^^^
-
-For Tkinter, you can download the `ActiveTcl Community Edition`_ from ActiveState. For ghostscript, you can get the installer at the `ghostscript downloads page`_.
-
-After installing ghostscript, you'll need to reboot your system to make sure that the ghostscript executable's path is in the windows PATH environment variable. In case you don't want to reboot, you can manually add the ghostscript executable's path to the PATH variable, `as shown here`_.
-
-.. _ActiveTcl Community Edition: https://www.activestate.com/activetcl/downloads
-.. _ghostscript downloads page: https://www.ghostscript.com/download/gsdnld.html
-.. _as shown here: https://java.com/en/download/help/path.xml
-
-----
-
-You can do the following checks to see if the dependencies were installed correctly.
-
-For Tkinter
-^^^^^^^^^^^
-
-Launch Python, and then at the prompt, type::
-
-    >>> import Tkinter
-
-Or in Python 3::
-
-    >>> import tkinter
-
-If you have Tkinter, Python will not print an error message, and if not, you will see an ``ImportError``.
-
-For ghostscript
-^^^^^^^^^^^^^^^
-
-Run the following to check the ghostscript version.
-
-For Ubuntu/macOS::
-
-    $ gs -version
-
-For Windows::
-
-    C:\> gswin64c.exe -version
-
-Or for Windows 32-bit::
-
-    C:\> gswin32c.exe -version
-
-If you have ghostscript, you should see the ghostscript version and copyright information.
-
-Finally, you can use pip to install Camelot::
-
-    $ pip install camelot-py[cv]
-
 From the source code
 --------------------
 
-After `installing the dependencies`_, you can install from the source by:
+After :ref:`installing the dependencies <install_deps>`, you can install from the source by:
 
 1. Cloning the GitHub repository.
 ::
@@ -112,5 +45,3 @@ After `installing the dependencies`_, you can install from the source by:
 
     $ cd camelot
     $ pip install ".[cv]"
-
-.. _installing the dependencies: https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip