Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added fileObj to read file object #331

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ coverage.xml
_build/

# vscode
.vscode
.vscode

.idea/
1 change: 1 addition & 0 deletions camelot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from .__version__ import __version__
from .io import read_pdf
from .io import read_fileObj
from .plotting import PlotMethods


Expand Down
103 changes: 85 additions & 18 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,42 @@ class PDFHandler(object):
Password for decryption.

"""
def __init__(self, filepath, pages='1', password=None):
    """Set up the handler for a PDF given by path or URL.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file. A URL is downloaded first and
        the downloaded location is used from then on.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    NotImplementedError
        If the (possibly downloaded) path does not end in '.pdf'.

    """
    # Resolve remote documents to a local path before any validation.
    self.filepath = download_url(filepath) if is_url(filepath) else filepath
    if not self.filepath.lower().endswith('.pdf'):
        raise NotImplementedError("File format not supported")

    if password is not None:
        self.password = password
        # On Python 2 the password is stored ASCII-encoded.
        if sys.version_info[0] < 3:
            self.password = self.password.encode('ascii')
    else:
        self.password = ''

    # The password must be set first: page resolution may decrypt the file.
    self.pages = self._get_pages(self.filepath, pages)
def __init__(self, filepath="", fileObj="", pages='1', password=None):
    """Set up the handler for a PDF given by path/URL or by file object.

    At least one of `filepath` and `fileObj` must be supplied. If both
    are supplied, `filepath` is still downloaded/validated, but the file
    object takes precedence and `self.filepath` is reset to ''.

    Parameters
    ----------
    filepath : str, optional (default: '')
        Filepath or URL of the PDF file.
    fileObj : file-like object, optional (default: '')
        File object of the PDF file, handed to ``PdfFileReader``.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    Raises
    ------
    ValueError
        If neither `filepath` nor `fileObj` is provided.
    NotImplementedError
        If `filepath` is given and does not end in '.pdf'.

    """
    # Treat both the '' sentinel and None as "not supplied".
    has_path = filepath is not None and filepath != ""
    has_obj = fileObj is not None and fileObj != ""
    if not (has_path or has_obj):
        # Fail here instead of leaving a half-initialised handler that
        # only breaks later with an AttributeError in parse().
        raise ValueError("Either filepath or fileObj must be provided")

    # Normalise the password once; _get_pages() may need it to decrypt.
    if password is None:
        self.password = ''
    else:
        self.password = password
        if sys.version_info[0] < 3:
            self.password = self.password.encode('ascii')

    self.filepath = ""
    self.fileObj = ""

    if has_path:
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith('.pdf'):
            raise NotImplementedError("File format not supported")
        self.pages = self._get_pages(filepath=self.filepath, pages=pages)

    if has_obj:
        # The file object wins over a path when both were passed.
        self.fileObj = fileObj
        self.filepath = ""
        self.pages = self._get_pages(fileObj=self.fileObj, pages=pages)

def _get_pages(self, filepath, pages):
def _get_pages(self, filepath="", pages='1',fileObj=""):
"""Converts pages string to list of ints.

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
fileObj : str
File Object of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -63,7 +77,10 @@ def _get_pages(self, filepath, pages):
if pages == '1':
page_numbers.append({'start': 1, 'end': 1})
else:
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if filepath:
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if fileObj:
infile = PdfFileReader(fileObj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
Expand All @@ -82,7 +99,7 @@ def _get_pages(self, filepath, pages):
P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P))

def _save_page(self, filepath, page, temp):
def _save_page(self, filepath, page='1', temp=''):
"""Saves specified page from PDF into a temporary directory.

Parameters
Expand All @@ -95,6 +112,7 @@ def _save_page(self, filepath, page, temp):
Tmp directory.

"""

with open(filepath, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
Expand Down Expand Up @@ -128,6 +146,52 @@ def _save_page(self, filepath, page, temp):
with open(fpath, 'wb') as f:
outfile.write(f)

def _save_page_new(self, fileObj, page='1', temp=''):
    """Saves specified page from a PDF file object into a temporary directory.

    The page is written to ``page-<page>.pdf`` inside `temp`. When
    ``get_rotation`` reports a rotation, that file is renamed to
    ``p-<page>_rotated.pdf`` and a copy rotated by 90 degrees in the
    opposite direction is written back to ``page-<page>.pdf``.

    Parameters
    ----------
    fileObj : file-like object
        File object of the PDF file, handed to ``PdfFileReader``.
    page : int or str, optional (default: '1')
        1-based page number.
    temp : str, optional (default: '')
        Tmp directory.

    """
    # The declared default is the string '1'; coerce so that the index
    # arithmetic below works for both str and int page numbers.
    page_number = int(page)

    infile = PdfFileReader(fileObj, strict=False)
    if infile.isEncrypted:
        infile.decrypt(self.password)

    fpath = os.path.join(temp, 'page-{0}.pdf'.format(page_number))
    outfile = PdfFileWriter()
    outfile.addPage(infile.getPage(page_number - 1))
    with open(fpath, 'wb') as f:
        outfile.write(f)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == '':
        return

    # Build the sibling name from the file name only. A str.replace on
    # the full path would also rewrite 'page' inside the temp directory.
    fpath_new = os.path.join(temp, 'p-{0}_rotated.pdf'.format(page_number))
    os.rename(fpath, fpath_new)

    # Keep the source open until the rotated copy is written (the reader
    # presumably pulls page data from the stream on write), then close it
    # instead of leaking the handle.
    with open(fpath_new, 'rb') as rotated_src:
        infile = PdfFileReader(rotated_src, strict=False)
        if infile.isEncrypted:
            infile.decrypt(self.password)
        outfile = PdfFileWriter()
        p = infile.getPage(0)
        if rotation == 'anticlockwise':
            p.rotateClockwise(90)
        elif rotation == 'clockwise':
            p.rotateCounterClockwise(90)
        outfile.addPage(p)
        with open(fpath, 'wb') as f:
            outfile.write(f)

def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Expand All @@ -153,7 +217,10 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
if self.filepath != "":
self._save_page(filepath=self.filepath, page=p, temp=tempdir)
if self.fileObj != "":
self._save_page_new(fileObj=self.fileObj, page=p, temp=tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
Expand Down
101 changes: 101 additions & 0 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,104 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
layout_kwargs=layout_kwargs, **kwargs)
return tables


def read_fileObj(fileObj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs={},
                 **kwargs):
    """Read PDF from a file object and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream'
    and kwargs annotated with * can only be used with flavor='lattice'.

    Parameters
    ----------
    fileObj : file-like object
        File object of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
        The parsing method to use ('lattice' or 'stream').
        Lattice is used by default.
    suppress_stdout : bool, optional (default: False)
        Suppress warnings while reading; also forwarded to the parser.
    layout_kwargs : dict, optional (default: {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    row_tol^ : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol^ : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    line_tol* : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize* : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations* : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    NotImplementedError
        If `flavor` is neither 'lattice' nor 'stream'.
    ValueError
        If `fileObj` is None or an empty string.

    """
    if flavor not in ['lattice', 'stream']:
        raise NotImplementedError("Unknown flavor specified."
                                  " Use either 'lattice' or 'stream'")

    # Reject a missing source up front; otherwise the handler is built
    # without pages and the failure only surfaces later during parsing.
    if fileObj is None or fileObj == "":
        raise ValueError("fileObj must be a file object of the PDF file")

    # Hand a private copy downstream so the shared default dict (and a
    # caller-owned dict) can never be mutated by the parser.
    layout_kwargs = dict(layout_kwargs or {})

    with warnings.catch_warnings():
        if suppress_stdout:
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
        p = PDFHandler(fileObj=fileObj, pages=pages, password=password)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
                         layout_kwargs=layout_kwargs, **kwargs)
        return tables