atlanhq · deepankurtaneja · May 24, 2019 · May 27, 2019
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,6 @@ coverage.xml
 _build/
 
 # vscode
-.vscode
+.vscode
+
+.idea/
diff --git a/camelot/__init__.py b/camelot/__init__.py
@@ -6,6 +6,7 @@
 
 from .__version__ import __version__
 from .io import read_pdf
+from .io import read_fileObj
 from .plotting import PlotMethods
 
 

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -27,28 +27,42 @@ class PDFHandler(object):
  Password for decryption.
 
  """
- def __init__(self, filepath, pages='1', password=None):
- if is_url(filepath):
- filepath = download_url(filepath)
- self.filepath = filepath
- if not filepath.lower().endswith('.pdf'):
- raise NotImplementedError("File format not supported")
-
- if password is None:
- self.password = ''
- else:
- self.password = password
- if sys.version_info[0] < 3:
- self.password = self.password.encode('ascii')
- self.pages = self._get_pages(self.filepath, pages)
+ def __init__(self, filepath="",fileObj="", pages='1', password=None):
+ if filepath != "":
+ if is_url(filepath):
+ filepath = download_url(filepath)
+ self.filepath = filepath
+ self.fileObj = ""
+ if not filepath.lower().endswith('.pdf'):
+ raise NotImplementedError("File format not supported")
+
+ if password is None:
+ self.password = ''
+ else:
+ self.password = password
+ if sys.version_info[0] < 3:
+ self.password = self.password.encode('ascii')
+ self.pages = self._get_pages(filepath=self.filepath, pages=pages)
+ if fileObj != "":
+ self.fileObj = fileObj
+ self.filepath = ""
+ if password is None:
+ self.password = ''
+ else:
+ self.password = password
+ if sys.version_info[0] < 3:
+ self.password = self.password.encode('ascii')
+ self.pages = self._get_pages(fileObj=self.fileObj, pages=pages)
 
- def _get_pages(self, filepath, pages):
+ def _get_pages(self, filepath="", pages='1',fileObj=""):
  """Converts pages string to list of ints.
 
  Parameters
  ----------
  filepath : str
  Filepath or URL of the PDF file.
+ fileObj : file-like object
+ File object of the PDF file, passed to PdfFileReader as-is.
  pages : str, optional (default: '1')
  Comma-separated page numbers.
  Example: '1,3,4' or '1,4-end' or 'all'.
@@ -63,7 +77,10 @@ def _get_pages(self, filepath, pages):
  if pages == '1':
  page_numbers.append({'start': 1, 'end': 1})
  else:
- infile = PdfFileReader(open(filepath, 'rb'), strict=False)
+ if filepath:
+ infile = PdfFileReader(open(filepath, 'rb'), strict=False)
+ if fileObj:
+ infile = PdfFileReader(fileObj, strict=False)
  if infile.isEncrypted:
  infile.decrypt(self.password)
  if pages == 'all':
@@ -82,7 +99,7 @@ def _get_pages(self, filepath, pages):
  P.extend(range(p['start'], p['end'] + 1))
  return sorted(set(P))
 
- def _save_page(self, filepath, page, temp):
+ def _save_page(self, filepath, page='1', temp=''):
  """Saves specified page from PDF into a temporary directory.
 
  Parameters
@@ -95,6 +112,7 @@ def _save_page(self, filepath, page, temp):
  Tmp directory.
 
  """
+
  with open(filepath, 'rb') as fileobj:
  infile = PdfFileReader(fileobj, strict=False)
  if infile.isEncrypted:
@@ -128,6 +146,52 @@ def _save_page(self, filepath, page, temp):
  with open(fpath, 'wb') as f:
  outfile.write(f)
 
+ def _save_page_new(self, fileObj, page='1', temp=''):
+ """Saves specified page from PDF into a temporary directory.
+
+ Parameters
+ ----------
+ fileObj : file-like object
+ File object of the PDF file, passed to PdfFileReader as-is.
+ page : int
+ Page number.
+ temp : str
+ Tmp directory.
+
+ """
+ fileobj = fileObj
+ infile = PdfFileReader(fileobj, strict=False)
+ if infile.isEncrypted:
+ infile.decrypt(self.password)
+ fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
+ froot, fext = os.path.splitext(fpath)
+ p = infile.getPage(page - 1)
+ outfile = PdfFileWriter()
+ outfile.addPage(p)
+ with open(fpath, 'wb') as f:
+ outfile.write(f)
+ layout, dim = get_page_layout(fpath)
+ # fix rotated PDF
+ chars = get_text_objects(layout, ltype="char")
+ horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+ vertical_text = get_text_objects(layout, ltype="vertical_text")
+ rotation = get_rotation(chars, horizontal_text, vertical_text)
+ if rotation != '':
+ fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
+ os.rename(fpath, fpath_new)
+ infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
+ if infile.isEncrypted:
+ infile.decrypt(self.password)
+ outfile = PdfFileWriter()
+ p = infile.getPage(0)
+ if rotation == 'anticlockwise':
+ p.rotateClockwise(90)
+ elif rotation == 'clockwise':
+ p.rotateCounterClockwise(90)
+ outfile.addPage(p)
+ with open(fpath, 'wb') as f:
+ outfile.write(f)
+
  def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs):
  """Extracts tables by calling parser.get_tables on all single
  page PDFs.
@@ -153,7 +217,10 @@ def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwa
  tables = []
  with TemporaryDirectory() as tempdir:
  for p in self.pages:
- self._save_page(self.filepath, p, tempdir)
+ if self.filepath != "":
+ self._save_page(filepath=self.filepath, page=p, temp=tempdir)
+ if self.fileObj != "":
+ self._save_page_new(fileObj=self.fileObj, page=p, temp=tempdir)
  pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
  for p in self.pages]
  parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)

diff --git a/camelot/io.py b/camelot/io.py
@@ -105,3 +105,104 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
  tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
  layout_kwargs=layout_kwargs, **kwargs)
  return tables
+
+
+def read_fileObj(fileObj, pages='1', password=None, flavor='lattice', suppress_stdout=False, layout_kwargs={},
+ **kwargs):
+ """Read PDF and return extracted tables.
+
+ Note: kwargs annotated with ^ can only be used with flavor='stream'
+ and kwargs annotated with * can only be used with flavor='lattice'.
+
+ Parameters
+ ----------
+ fileObj : file-like object
+ File object of the PDF file (handed to PDFHandler as fileObj).
+ pages : str, optional (default: '1')
+ Comma-separated page numbers.
+ Example: '1,3,4' or '1,4-end' or 'all'.
+ password : str, optional (default: None)
+ Password for decryption.
+ flavor : str (default: 'lattice')
+ The parsing method to use ('lattice' or 'stream').
+ Lattice is used by default.
+ suppress_stdout : bool, optional (default: False)
+ Ignore warnings when True; also forwarded to PDFHandler.parse.
+ layout_kwargs : dict, optional (default: {})
+ A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+ table_areas : list, optional (default: None)
+ List of table area strings of the form x1,y1,x2,y2
+ where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+ in PDF coordinate space.
+ columns^ : list, optional (default: None)
+ List of column x-coordinates strings where the coordinates
+ are comma-separated.
+ split_text : bool, optional (default: False)
+ Split text that spans across multiple cells.
+ flag_size : bool, optional (default: False)
+ Flag text based on font size. Useful to detect
+ super/subscripts. Adds <s></s> around flagged text.
+ strip_text : str, optional (default: '')
+ Characters that should be stripped from a string before
+ assigning it to a cell.
+ row_tol^ : int, optional (default: 2)
+ Tolerance parameter used to combine text vertically,
+ to generate rows.
+ column_tol^ : int, optional (default: 0)
+ Tolerance parameter used to combine text horizontally,
+ to generate columns.
+ process_background* : bool, optional (default: False)
+ Process background lines.
+ line_scale* : int, optional (default: 15)
+ Line size scaling factor. The larger the value the smaller
+ the detected lines. Making it very large will lead to text
+ being detected as lines.
+ copy_text* : list, optional (default: None)
+ {'h', 'v'}
+ Direction in which text in a spanning cell will be copied
+ over.
+ shift_text* : list, optional (default: ['l', 't'])
+ {'l', 'r', 't', 'b'}
+ Direction in which text in a spanning cell will flow.
+ line_tol* : int, optional (default: 2)
+ Tolerance parameter used to merge close vertical and horizontal
+ lines.
+ joint_tol* : int, optional (default: 2)
+ Tolerance parameter used to decide whether the detected lines
+ and points lie close to each other.
+ threshold_blocksize* : int, optional (default: 15)
+ Size of a pixel neighborhood that is used to calculate a
+ threshold value for the pixel: 3, 5, 7, and so on.
+
+ For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+ threshold_constant* : int, optional (default: -2)
+ Constant subtracted from the mean or weighted mean.
+ Normally, it is positive but may be zero or negative as well.
+
+ For more information, refer to `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+ iterations* : int, optional (default: 0)
+ Number of times erosion/dilation is applied.
+
+ For more information, refer to `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+ resolution* : int, optional (default: 300)
+ Resolution used for PDF to PNG conversion.
+
+ Returns
+ -------
+ tables : camelot.core.TableList
+
+ """
+ if flavor not in ['lattice', 'stream']:
+ raise NotImplementedError("Unknown flavor specified."
+ " Use either 'lattice' or 'stream'")
+
+ with warnings.catch_warnings():
+ if suppress_stdout:
+ warnings.simplefilter("ignore")
+
+ validate_input(kwargs, flavor=flavor)
+ p = PDFHandler(fileObj=fileObj, pages=pages, password=password)
+ kwargs = remove_extra(kwargs, flavor=flavor)
+ tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,
+ layout_kwargs=layout_kwargs, **kwargs)
+ return tables