o
    fl                  #   @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlm Z m!Z!m"Z" 													d-dede"de#de#de
e de$de
ee$  de#de%de$de#de
e# de&d e&d!e&d"ed#df"d$d%Z'				&		d.d'e!de#de
ee$  de$d(e&de#de
e d#e#fd)d*Z(				&	d/d'e!de#de
ee$  de$d(e&de
e d#e	e fd+d,Z)dS )0zIFunctions that can be used for the most common use-cases for pdfminer.six    N)StringIO)AnyBinaryIO	ContainerIteratorOptionalcast   )XMLConverterHTMLConverterTextConverterPDFPageAggregatorHOCRConverter)ImageWriter)LAParamsLTPage)	PDFDeviceTagExtractor)PDFResourceManagerPDFPageInterpreter)PDFPage)open_filename
FileOrNameAnyIOtextutf-8       ?normalFinfoutfpoutput_typecodeclaparamsmaxpagespage_numberspasswordscalerotation
layoutmode
output_dirstrip_controldebugdisable_cachingkwargsreturnc              	   K   sL  |r
t  t j d}|rt|}t| d}d}|dkr'|tjkr'tjj}|dkr5t	|||||d}nE|dkrDt
||||||d}n6|dkrTt|||||
||d}n&|d	krbt|||||d
}n|dkrqt|tt||d}n	d| }t||dusJ t||}tj| |||| dD ]}|j|	 d |_|| q|  dS )ak  Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor
    :param rotation: Rotation factor
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter
    :param output_dir: If given, creates an ImageWriter for extracted images.
    :param strip_control: Does what it says on the tin
    :param debug: Output more logging data
    :param disable_caching: Does what it says on the tin
    :param other:
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    Ncachingr   )r"   r#   imagewriterxml)r"   r#   r2   stripcontrolhtml)r"   r'   r)   r#   r2   hocr)r"   r#   r4   tag)r"   z1Output type can be text, html, xml or tag but is r$   r&   r1   ih  )logging	getLoggersetLevelDEBUGr   r   sysstdoutbufferr   r
   r   r   r   r   r   
ValueErrorr   r   	get_pagesrotateprocess_pageclose)r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r2   rsrcmgrdevicemsginterpreterpage rJ   L/home/ubuntu/webapp/venv/lib/python3.10/site-packages/pdfminer/high_level.pyextract_text_to_fp   sh   /
	




rL   Tpdf_filer1   c              	   C   s   |du rt  }t| dP}t <}tt|}t|d}	t|	|||d}
t|	|
}tj	|||||dD ]}|
| q2| W  d   W  d   S 1 sOw   Y  W d   dS 1 s_w   Y  dS )aw  Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    Nrbr0   )r"   r#   r8   )r   r   r   r   r   r   r   r   r   rA   rC   getvalue)rM   r&   r%   r$   r1   r"   r#   fpoutput_stringrE   rF   rH   rI   rJ   rJ   rK   extract_text   s"   



RrR   c                 c   s    |du rt  }t| d7}tt|}t|d}t||d}t||}	tj|||||dD ]}
|		|
 |
 }|V  q-W d   dS 1 sGw   Y  dS )a  Extract and yield LTPage objects

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: LTPage objects
    NrN   r0   )r#   r8   )r   r   r   r   r   r   r   r   rA   rC   
get_result)rM   r&   r%   r$   r1   r#   rP   resource_managerrF   rH   rI   layoutrJ   rJ   rK   extract_pages   s    





"rV   )r   r   Nr   Nr   r   r   r   NFFF)r   Nr   Tr   N)r   Nr   TN)*__doc__r9   r=   ior   typingr   r   r   r   r   r   	converterr
   r   r   r   r   imager   rU   r   r   	pdfdevicer   r   	pdfinterpr   r   pdfpager   utilsr   r   r   strintfloatboolrL   rR   rV   rJ   rJ   rJ   rK   <module>   s     
	

t

-
