o
    f_                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddlm!Z! ddlm"Z" ddlm#Z# ddlm$Z$ ddlm%Z% e&e'Z(G dd dZ)G dd dZ*G dd dZ+G dd dZ,G dd de+Z-G dd  d e-Z.G d!d" d"e.Z/G d#d$ d$e.Z0G d%d& d&e-Z1G d'd( d(e+e,Z2G d)d* d*e-e,Z3ed+e+d,Z4G d-d. d.e-ee4 Z5G d/d0 d0e5e4 Z6G d1d2 d2e6e4 e,Z7ee3e2f Z8G d3d4 d4e7e8 Z9G d5d6 d6e9Z:G d7d8 d8e9Z;G d9d: d:e7e9 Z<G d;d< d<e<Z=G d=d> d>e<Z>ee<d?f Z?G d@d? d?e7e? Z@G dAdB dBe@ZAG dCdD dDe@ZBG dEdF dFe5e- ZCG dGdH dHeCZDG dIdJ dJeCZEdS )K    N)DictGenericIterableIteratorListOptionalSequenceSetTupleTypeVarUnioncast   )PDFColorSpace)PDFFont)Color)PDFGraphicState)	PDFStream)INFPathSegment)LTComponentT)Matrix)Plane)Point)Rectapply_matrix_pt)bbox2str)fsplit)	get_bound)
matrix2str)uniqc                   @   s*   e Zd ZddeddfddZdd	d
ZdS )IndexAssignerr   indexreturnNc                 C   
   || _ d S Nr#   )selfr#    r)   H/home/ubuntu/webapp/venv/lib/python3.10/site-packages/pdfminer/layout.py__init__(      
zIndexAssigner.__init__objLTItemc                 C   sJ   t |tr| j|_|  jd7  _d S t |tr!|D ]	}| | qd S d S Nr   )
isinstance	LTTextBoxr#   LTTextGrouprun)r(   r-   xr)   r)   r*   r3   +   s   

zIndexAssigner.runr   )r-   r.   r$   N)__name__
__module____qualname__intr+   r3   r)   r)   r)   r*   r"   '   s    r"   c                   @   sf   e Zd ZdZ							ddededed	ed
ee dededdfddZdddZde	fddZ
dS )LAParamsa  Parameters for layout analysis

    :param line_overlap: If two characters have more overlap than this they
        are considered to be on the same line. The overlap is specified
        relative to the minimum height of both characters.
    :param char_margin: If two characters are closer together than this
        margin they are considered part of the same line. The margin is
        specified relative to the width of the character.
    :param word_margin: If two characters on the same line are further apart
        than this margin then they are considered to be two separate words, and
        an intermediate space will be added for readability. The margin is
        specified relative to the width of the character.
    :param line_margin: If two lines are are close together they are
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
        of a text matters when determining the order of text boxes. The value
        should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters). You can also pass
        `None` to disable advanced layout analysis, and instead return text
        based on the position of the bottom left corner of the text box.
    :param detect_vertical: If vertical text should be considered during
        layout analysis
    :param all_texts: If layout analysis should be performed on text in
        figures.
          ?       @皙?Fline_overlapchar_marginline_marginword_margin
boxes_flowdetect_vertical	all_textsr$   Nc                 C   s6   || _ || _|| _|| _|| _|| _|| _|   d S r&   )r>   r?   r@   rA   rB   rC   rD   	_validate)r(   r>   r?   r@   rA   rB   rC   rD   r)   r)   r*   r+   P   s   
zLAParams.__init__c                 C   s\   | j d ur*d}t| j tst| j tst|d| j   kr%dks,t| t|d S d S )Nz@LAParam boxes_flow should be None, or a number between -1 and +1r   )rB   r0   r9   float	TypeError
ValueError)r(   boxes_flow_err_msgr)   r)   r*   rE   d   s   


zLAParams._validatec                 C   s   d| j | j| j| jf S )NzM<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>)r?   r@   rA   rD   r(   r)   r)   r*   __repr__p   s   zLAParams.__repr__)r;   r<   r;   r=   r;   FFr$   N)r6   r7   r8   __doc__rG   r   boolr+   rE   strrL   r)   r)   r)   r*   r:   4   s8    	

r:   c                   @   s"   e Zd ZdZdeddfddZdS )r.   z)Interface for things that can be analyzedlaparamsr$   Nc                 C      dS )zPerform the layout analysis.Nr)   r(   rQ   r)   r)   r*   analyze{      zLTItem.analyze)r6   r7   r8   rN   r:   rT   r)   r)   r)   r*   r.   x   s    r.   c                   @   s,   e Zd ZdZdefddZdefddZdS )LTTextz#Interface for things that have textr$   c                 C   s   d| j j|  f S )Nz<%s %r>)	__class__r6   get_textrK   r)   r)   r*   rL         zLTText.__repr__c                 C      t )zText contained in this objectNotImplementedErrorrK   r)   r)   r*   rX      rU   zLTText.get_textN)r6   r7   r8   rN   rP   rL   rX   r)   r)   r)   r*   rV      s    rV   c                   @   s  e Zd ZdZdeddfddZdefddZd	ede	fd
dZ
d	ede	fddZd	ede	fddZd	ede	fddZdeddfddZde	fddZdd de	fddZdd defddZdd defddZdd de	fddZdd defdd Zdd defd!d"ZdS )#LTComponentzObject with a bounding boxbboxr$   Nc                 C   s   t |  | | d S r&   )r.   r+   set_bboxr(   r^   r)   r)   r*   r+      s   
zLTComponent.__init__c                 C   s   d| j jt| jf S )Nz<%s %s>)rW   r6   r   r^   rK   r)   r)   r*   rL         zLTComponent.__repr___c                 C   rZ   r&   rI   r(   rb   r)   r)   r*   __lt__      zLTComponent.__lt__c                 C   rZ   r&   rc   rd   r)   r)   r*   __le__   rf   zLTComponent.__le__c                 C   rZ   r&   rc   rd   r)   r)   r*   __gt__   rf   zLTComponent.__gt__c                 C   rZ   r&   rc   rd   r)   r)   r*   __ge__   rf   zLTComponent.__ge__c                 C   sB   |\}}}}|| _ || _|| _|| _|| | _|| | _|| _d S r&   )x0y0x1y1widthheightr^   )r(   r^   rj   rk   rl   rm   r)   r)   r*   r_      s   


zLTComponent.set_bboxc                 C   s   | j dkp	| jdkS Nr   )rn   ro   rK   r)   r)   r*   is_empty   rY   zLTComponent.is_emptyr-   c                 C   2   t |tsJ tt||j| jko| j|jkS r&   )r0   r]   rP   typerj   rl   r(   r-   r)   r)   r*   is_hoverlap      zLTComponent.is_hoverlapc                 C   J   t |tsJ tt|| |rdS tt| j|j t| j|j S rp   	r0   r]   rP   rs   ru   minabsrj   rl   rt   r)   r)   r*   	hdistance      
"zLTComponent.hdistancec                 C   J   t |tsJ tt|| |r#tt| j|j t| j|j S dS rp   rx   rt   r)   r)   r*   hoverlap      
"zLTComponent.hoverlapc                 C   rr   r&   )r0   r]   rP   rs   rk   rm   rt   r)   r)   r*   is_voverlap   rv   zLTComponent.is_voverlapc                 C   rw   rp   	r0   r]   rP   rs   r   ry   rz   rk   rm   rt   r)   r)   r*   	vdistance   r|   zLTComponent.vdistancec                 C   r}   rp   r   rt   r)   r)   r*   voverlap   r   zLTComponent.voverlap)r6   r7   r8   rN   r   r+   rP   rL   objectrO   re   rg   rh   ri   r_   rq   ru   rG   r{   r~   r   r   r   r)   r)   r)   r*   r]      s     
r]   c                   @   s   e Zd ZdZ							ddedee dededed	ee	 d
ee	 deee
  deeeef  ddfddZdefddZdS )LTCurvez
    A generic Bezier curve

    The parameter `original_path` contains the original
    pathing information from the pdf (e.g. for reconstructing Bezier Curves).

    `dashing_style` contains the Dashing information if any.
    FN	linewidthptsstrokefillevenoddstroking_colornon_stroking_colororiginal_pathdashing_styler$   c
           
      C   sJ   t | t| || _|| _|| _|| _|| _|| _|| _	|| _
|	| _d S r&   )r]   r+   r   r   r   r   r   r   r   r   r   r   )
r(   r   r   r   r   r   r   r   r   r   r)   r)   r*   r+      s   
zLTCurve.__init__c                 C   s   d dd | jD S )N,c                 s   s    | ]}d | V  qdS )z	%.3f,%.3fNr)   ).0pr)   r)   r*   	<genexpr>   s    z"LTCurve.get_pts.<locals>.<genexpr>)joinr   rK   r)   r)   r*   get_pts   ra   zLTCurve.get_ptsFFFNNNN)r6   r7   r8   rN   rG   r   r   rO   r   r   r   r
   r   r+   rP   r   r)   r)   r)   r*   r      s>    
	

r   c                   @   sr   e Zd ZdZ							ddededededed	ed
ee dee dee	e
  deeeef  ddfddZdS )LTLinezOA single straight line.

    Could be used for separating text or figures.
    FNr   p0p1r   r   r   r   r   r   r   r$   c                 C   s$   t | |||g||||||	|

 d S r&   r   r+   )r(   r   r   r   r   r   r   r   r   r   r   r)   r)   r*   r+      s   zLTLine.__init__r   )r6   r7   r8   rN   rG   r   rO   r   r   r   r   r
   r   r+   r)   r)   r)   r*   r      s@    
	

r   c                   @   sn   e Zd ZdZ							ddededededed	ee d
ee dee	e
  deeeef  ddfddZdS )LTRectzMA rectangle.

    Could be used for framing another pictures or figures.
    FNr   r^   r   r   r   r   r   r   r   r$   c
                 C   sD   |\}
}}}t | ||
|f||f||f|
|fg|||||||	
 d S r&   r   )r(   r   r^   r   r   r   r   r   r   r   rj   rk   rl   rm   r)   r)   r*   r+      s   zLTRect.__init__r   )r6   r7   r8   rN   rG   r   rO   r   r   r   r   r
   r   r+   r)   r)   r)   r*   r     s<    	
	
r   c                   @   s8   e Zd ZdZdedededdfddZdefd	d
ZdS )LTImagezKAn image object.

    Embedded images can be in JPEG, Bitmap or JBIG2.
    namestreamr^   r$   Nc                 C   sr   t | | || _|| _|d|df| _|d| _|dd| _|d| _t	| jt
s7| jg| _d S d S )N)WWidth)HHeight)IM	ImageMask)BPCBitsPerComponentr   )CS
ColorSpace)r]   r+   r   r   get_anysrcsize	imagemaskbits
colorspacer0   list)r(   r   r   r^   r)   r)   r*   r+   A  s   zLTImage.__init__c                 C      d| j j| jt| j| jf S Nz<%s(%s) %s %r>)rW   r6   r   r   r^   r   rK   r)   r)   r*   rL   L     zLTImage.__repr__)	r6   r7   r8   rN   rP   r   r   r+   rL   r)   r)   r)   r*   r   ;  s    r   c                   @   s0   e Zd ZdZdeddfddZdefddZdS )	LTAnnoa  Actual letter in the text as a Unicode string.

    Note that, while a LTChar object has actual boundaries, LTAnno objects does
    not, as these are "virtual" characters, inserted by a layout analyzer
    according to the relationship between two characters (e.g. a space).
    textr$   Nc                 C   r%   r&   _text)r(   r   r)   r)   r*   r+   ]  s   zLTAnno.__init__c                 C      | j S r&   r   rK   r)   r)   r*   rX   a     zLTAnno.get_text)r6   r7   r8   rN   rP   r+   rX   r)   r)   r)   r*   r   U  s    r   c                   @   s   e Zd ZdZdededededededed	eee	e
e ef f d
ededdfddZdefddZdefddZdedefddZdS )LTCharz.Actual letter in the text as a Unicode string.matrixfontfontsizescalingriser   	textwidthtextdispncsgraphicstater$   Nc                 C   s~  t |  || _|| _|j| _|	| _|
| _|| | | _| rVt	|t
s'J |\}}|d u r4|d }n|| d }d| | d }| || | j f}| | || f}n| | }d|| f}| j|| | f}| j\}}}}}}d|| | k o|| dk| _t| j|\}}t| j|\}}||k r||}}||k r||}}t| ||||f | r| j| _d S | j| _d S )Nr;   gMbP?i  r   )rV   r+   r   r   fontnamer   r   advis_verticalr0   tupleget_descentuprightr   r]   rn   sizero   )r(   r   r   r   r   r   r   r   r   r   r   vxvybbox_lower_leftbbox_upper_rightdescentabcdefrj   rk   rl   rm   r)   r)   r*   r+   h  sB   



zLTChar.__init__c                 C   s,   d| j jt| jt| j| j| j|  f S )Nz(<%s %s matrix=%s font=%r adv=%s text=%r>)	rW   r6   r   r^   r    r   r   r   rX   rK   r)   r)   r*   rL     s   zLTChar.__repr__c                 C   r   r&   r   rK   r)   r)   r*   rX     r   zLTChar.get_textr-   c                 C   rR   )z<Returns True if two characters can coexist in the same line.Tr)   rt   r)   r)   r*   is_compatible  rU   zLTChar.is_compatible)r6   r7   r8   rN   r   r   rG   rP   r   r
   r   r   r   r+   rL   rX   r   rO   r   r)   r)   r)   r*   r   e  s8    	

4
r   LTItemT)boundc                   @   s|   e Zd ZdZdeddfddZdee fddZde	fd	d
Z
deddfddZdee ddfddZdeddfddZdS )LTContainerz(Object that can be extended and analyzedr^   r$   Nc                 C   s   t | | g | _d S r&   )r]   r+   _objsr`   r)   r)   r*   r+        zLTContainer.__init__c                 C   
   t | jS r&   )iterr   rK   r)   r)   r*   __iter__  r,   zLTContainer.__iter__c                 C   r   r&   )lenr   rK   r)   r)   r*   __len__  r,   zLTContainer.__len__r-   c                 C   s   | j | d S r&   )r   appendrt   r)   r)   r*   add  s   zLTContainer.addobjsc                 C   s   |D ]}|  | qd S r&   )r   )r(   r   r-   r)   r)   r*   extend  s   zLTContainer.extendrQ   c                 C   s   | j D ]}|| qd S r&   )r   rT   r(   rQ   r-   r)   r)   r*   rT     s   
zLTContainer.analyze)r6   r7   r8   rN   r   r+   r   r   r   r9   r   r   r   r   r:   rT   r)   r)   r)   r*   r     s    r   c                   @   s(   e Zd ZdddZdeddfddZdS )	LTExpandableContainerr$   Nc                 C   s    t | t
 t
 t t f d S r&   )r   r+   r   rK   r)   r)   r*   r+     s   zLTExpandableContainer.__init__r-   c                 C   sP   t | tt| | t| j|jt| j|jt| j	|j	t| j
|j
f d S r&   )r   r   r   r   r_   ry   rj   rk   maxrl   rm   rt   r)   r)   r*   r     s   zLTExpandableContainer.addrM   )r6   r7   r8   r+   r]   r   r)   r)   r)   r*   r     s    
r   c                   @   s$   e Zd ZdddZdefddZdS )LTTextContainerr$   Nc                 C   s   t |  t|  d S r&   )rV   r+   r   rK   r)   r)   r*   r+        

zLTTextContainer.__init__c                 C   s   d dd | D S )N c                 s   s(    | ]}t |trtt| V  qd S r&   )r0   rV   r   rX   r   r-   r)   r)   r*   r     s    

z+LTTextContainer.get_text.<locals>.<genexpr>)r   rK   r)   r)   r*   rX     s   
zLTTextContainer.get_textrM   )r6   r7   r8   r+   rP   rX   r)   r)   r)   r*   r     s    
r   c                       sz   e Zd ZdZdeddf fddZdefddZd	eddfd
dZ	de
e deded  fddZdef fddZ  ZS )
LTTextLinezContains a list of LTChar objects that represent a single text line.

    The characters are aligned either horizontally or vertically, depending on
    the text's writing mode.
    rA   r$   Nc                    s   t    || _d S r&   )superr+   rA   r(   rA   rW   r)   r*   r+        
zLTTextLine.__init__c                 C   s   d| j jt| j|  f S )Nz
<%s %s %r>)rW   r6   r   r^   rX   rK   r)   r)   r*   rL     s
   zLTTextLine.__repr__rQ   c                 C   s*   | j D ]}|| qt| td d S )N
)r   rT   r   r   r   r   r)   r)   r*   rT     s   
zLTTextLine.analyzeplaneratioc                 C   rZ   r&   r[   )r(   r   r   r)   r)   r*   find_neighbors
  s   zLTTextLine.find_neighborsc                    s   t   p
|   S r&   )r   rq   rX   isspacerK   r   r)   r*   rq     ra   zLTTextLine.is_empty)r6   r7   r8   rN   rG   r+   rP   rL   r:   rT   r   r   r   r   rO   rq   __classcell__r)   r)   r   r*   r     s    
r   c                       s   e Zd ZdeddfddZdeddf fddZd	ee d
ede	e
 fddZddededefddZddededefddZ	ddededefddZddededefddZ  ZS )LTTextLineHorizontalrA   r$   Nc                 C   s   t | | t
 | _d S r&   )r   r+   r   _x1r   r)   r)   r*   r+        zLTTextLineHorizontal.__init__r-   c                    s\   t |tr"| jr"| jt|j|j }| j|j| k r"t	| t
d |j| _t 	| d S N )r0   r   rA   r   rn   ro   r   rj   r   r   r   rl   r   r(   r-   marginr   r)   r*   r        zLTTextLineHorizontal.addr   r   c                    s@   |j   |jj  jj  f} fdd|D S )aK  
        Finds neighboring LTTextLineHorizontals in the plane.

        Returns a list of other LTTestLineHorizontals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same height as self, and also either left-, right-, or
        centrally-aligned.
        c                    R   g | ]%}t |trj| d rj| d s%j| d s%j| d r|qS )	tolerance)r0   r   _is_same_height_as_is_left_aligned_with_is_right_aligned_with_is_centrally_aligned_withr   r   r(   r)   r*   
<listcomp>1      	z7LTTextLineHorizontal.find_neighbors.<locals>.<listcomp>)ro   findrj   rk   rl   rm   r(   r   r   r   r)   r  r*   r   $  
   
"z#LTTextLineHorizontal.find_neighborsr   otherr   c                 C      t |j| j |kS )zN
        Whether the left-hand edge of `other` is within `tolerance`.
        )rz   rj   r(   r	  r   r)   r)   r*   r   ?     z*LTTextLineHorizontal._is_left_aligned_withc                 C   r
  )zO
        Whether the right-hand edge of `other` is within `tolerance`.
        )rz   rl   r  r)   r)   r*   r  E  r  z+LTTextLineHorizontal._is_right_aligned_withc                 C   (   t |j|j d | j| j d  |kS )zQ
        Whether the horizontal center of `other` is within `tolerance`.
           )rz   rj   rl   r  r)   r)   r*   r  K     (z/LTTextLineHorizontal._is_centrally_aligned_withc                 C   r
  r&   )rz   ro   r  r)   r)   r*   r   S  rY   z'LTTextLineHorizontal._is_same_height_asr5   )r6   r7   r8   rG   r+   r]   r   r   r   r   r   r   rO   r   r  r  r   r   r)   r)   r   r*   r     s*    	

 r   c                       s   e Zd ZdeddfddZdeddf fddZd	ee d
ede	e
 fddZddededefddZddededefddZ	ddededefddZdededefddZ  ZS )LTTextLineVerticalrA   r$   Nc                 C   s   t | | t | _d S r&   )r   r+   r   _y0r   r)   r)   r*   r+   X  r   zLTTextLineVertical.__init__r-   c                    s\   t |tr"| jr"| jt|j|j }|j| | jk r"t	| t
d |j| _t 	| d S r   )r0   r   rA   r   rn   ro   rm   r  r   r   r   rk   r   r   r   r)   r*   r   _  r   zLTTextLineVertical.addr   r   c                    s@   |j   |j  jj  jf} fdd|D S )aG  
        Finds neighboring LTTextLineVerticals in the plane.

        Returns a list of other LTTextLineVerticals in the plane which are
        close to self. "Close" can be controlled by ratio. The returned objects
        will be the same width as self, and also either upper-, lower-, or
        centrally-aligned.
        c                    r   r   )r0   r  _is_same_width_as_is_lower_aligned_with_is_upper_aligned_withr  r   r  r)   r*   r  u  r  z5LTTextLineVertical.find_neighbors.<locals>.<listcomp>)rn   r  rj   rk   rl   rm   r  r)   r  r*   r   h  r  z!LTTextLineVertical.find_neighborsr   r	  r   c                 C   r
  )zJ
        Whether the lower edge of `other` is within `tolerance`.
        )rz   rk   r  r)   r)   r*   r    r  z)LTTextLineVertical._is_lower_aligned_withc                 C   r
  )zJ
        Whether the upper edge of `other` is within `tolerance`.
        )rz   rm   r  r)   r)   r*   r    r  z)LTTextLineVertical._is_upper_aligned_withc                 C   r  )zO
        Whether the vertical center of `other` is within `tolerance`.
        r  )rz   rk   rm   r  r)   r)   r*   r    r  z-LTTextLineVertical._is_centrally_aligned_withc                 C   r
  r&   )rz   rn   r  r)   r)   r*   r    rY   z$LTTextLineVertical._is_same_width_asr5   )r6   r7   r8   rG   r+   r]   r   r   r   r   r   r   rO   r  r  r  r  r   r)   r)   r   r*   r  W  s*    	

r  c                   @   s6   e Zd ZdZd
ddZdefddZdefdd	ZdS )r1   zRepresents a group of text chunks in a rectangular area.

    Note that this box is created by geometric analysis and does not
    necessarily represents a logical boundary of the text. It contains a list
    of LTTextLine objects.
    r$   Nc                 C   s   t |  d| _d S )NrF   )r   r+   r#   rK   r)   r)   r*   r+     r   zLTTextBox.__init__c                 C   s    d| j j| jt| j|  f S r   )rW   r6   r#   r   r^   rX   rK   r)   r)   r*   rL     s   zLTTextBox.__repr__c                 C   rZ   r&   r[   rK   r)   r)   r*   get_writing_mode  rf   zLTTextBox.get_writing_moderM   )r6   r7   r8   rN   r+   rP   rL   r  r)   r)   r)   r*   r1     s
    
r1   c                       4   e Zd Zdeddf fddZdefddZ  ZS )LTTextBoxHorizontalrQ   r$   Nc                    "   t  | | jjdd d d S )Nc                 S      | j  S r&   )rm   r-   r)   r)   r*   <lambda>      z-LTTextBoxHorizontal.analyze.<locals>.<lambda>keyr   rT   r   sortrS   r   r)   r*   rT        zLTTextBoxHorizontal.analyzec                 C   rR   )Nzlr-tbr)   rK   r)   r)   r*   r    rf   z$LTTextBoxHorizontal.get_writing_moder6   r7   r8   r:   rT   rP   r  r   r)   r)   r   r*   r        r  c                       r  )LTTextBoxVerticalrQ   r$   Nc                    r  )Nc                 S   r  r&   )rl   r  r)   r)   r*   r    r  z+LTTextBoxVertical.analyze.<locals>.<lambda>r  r  rS   r   r)   r*   rT     r!  zLTTextBoxVertical.analyzec                 C   rR   )Nztb-rlr)   rK   r)   r)   r*   r    rf   z"LTTextBoxVertical.get_writing_moder"  r)   r)   r   r*   r$    r#  r$  r2   c                       s*   e Zd Zdee ddf fddZ  ZS )r2   r   r$   Nc                    s   t    | | d S r&   )r   r+   r   )r(   r   r   r)   r*   r+     r   zLTTextGroup.__init__)r6   r7   r8   r   TextGroupElementr+   r   r)   r)   r   r*   r2     s    "c                       &   e Zd Zdeddf fddZ  ZS )LTTextGroupLRTBrQ   r$   Nc                    :   t  | |jd usJ |j | jj fddd d S )Nc                    s"   d  | j  d  | j| j   S r/   )rj   rk   rm   r  rB   r)   r*   r    s    z)LTTextGroupLRTB.analyze.<locals>.<lambda>r  r   rT   rB   r   r   rS   r   r)  r*   rT        
zLTTextGroupLRTB.analyzer6   r7   r8   r:   rT   r   r)   r)   r   r*   r'        r'  c                       r&  )LTTextGroupTBRLrQ   r$   Nc                    r(  )Nc                    s$   d   | j | j  d  | j  S r/   )rj   rl   rm   r  r)  r)   r*   r    s    z)LTTextGroupTBRL.analyze.<locals>.<lambda>r  r*  rS   r   r)  r*   rT     r+  zLTTextGroupTBRL.analyzer,  r)   r)   r   r*   r.    r-  r.  c                   @   s   e Zd ZdeddfddZdedee dee	 fdd	Z
ded
ee	 dee fddZdedee dee fddZdeddfddZdS )LTLayoutContainerr^   r$   Nc                 C   s   t | | d | _d S r&   )r   r+   groupsr`   r)   r)   r*   r+     r   zLTLayoutContainer.__init__rQ   r   c                 c   s   d }d }|D ]}|d ur| |o4||o4t|j|j|j ||k o4||t|j|j|j	 k }|j
o_| |o_||o_t|j|j|j ||k o_||t|j|j|j	 k }|rgt|tsn|rtt|trt|| nA|d ur~|V  d }n7|r|st|j}|| || n#|r|st|j}|| || nt|j}|| |V  d }|}q|d u rt|j}|d usJ || |V  d S r&   )r   r   ry   ro   r>   r   r{   r   rn   r?   rC   ru   r~   r   r0   r   r  r   rA   )r(   rQ   r   obj0lineobj1halignvalignr)   r)   r*   group_objects  sr   









zLTLayoutContainer.group_objectslinesc                 c   s    t | j}|| i }|D ]>}|||j}|g}|D ]}|| ||v r0||| qt|tr:t	 }	nt
 }	t|D ]}
|	|
 |	||
< qAqt }|D ]}||vrZqS|| }	|	|v rcqS||	 |	 so|	V  qSdS )z$Group neighboring lines to textboxesN)r   r^   r   r   r@   r   popr0   r   r  r$  r!   r   setrq   )r(   rQ   r7  r   boxesr2  	neighborsmembersr3  boxr-   doner)   r)   r*   group_textlinesC  s>   






z!LTLayoutContainer.group_textlinesr:  c              
      s  t ttf }t| j dtdtdtfdd}d|d|dt| f fdd}g }tt	|D ](}|| }t|d t	|D ]}	||	 }
|
d	|||
t|t|
||
f q<q-t|  | t }t	|d
krt|\}}}}}}||vr||vr|s|||rt|d|||||f qct|ttfst|ttfrt||g}nt||g} |  | |||g  D ]}t|d	|||t|t|||f q | t	|d
ksitdd  D S )ax  Group textboxes hierarchically.

        Get pair-wise distances, via dist func defined below, and then merge
        from the closest textbox pair. Once obj1 and obj2 are merged /
        grouped, the resulting group is considered as a new object, and its
        distances to other objects & groups are added to the process queue.

        For performance reason, pair-wise distances and object pair info are
        maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2)
        tuples. It ensures quick access to the smallest element. Note that
        since comparison operators, e.g., __lt__, are disabled for
        LTComponent, id(obj) has to appear before obj in element tuples.

        :param laparams: LAParams object.
        :param boxes: All textbox objects to be grouped.
        :return: a list that has only one element, the final top level group.
        r3  obj2r$   c                 S   s`   t | j|j}t | j|j}t| j|j}t| j|j}|| ||  | j| j  |j|j  S )a  A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            )ry   rj   rk   r   rl   rm   rn   ro   )r3  r@  rj   rk   rl   rm   r)   r)   r*   dist|  s   

z/LTLayoutContainer.group_textboxes.<locals>.distc                    s\   t | j|j}t | j|j}t| j|j}t| j|j}t ||||f}|| |fS )z8Check if there's any other object between obj1 and obj2.)	ry   rj   rk   r   rl   rm   r9  r  
difference)r3  r@  rj   rk   rl   rm   r   r   r)   r*   isany  s   z0LTLayoutContainer.group_textboxes.<locals>.isanyr   Fr   Tc                 s   s    | ]}t t|V  qd S r&   )r   r2   )r   gr)   r)   r*   r     s    z4LTLayoutContainer.group_textboxes.<locals>.<genexpr>)r   r1   r2   r   r^   r]   rG   r	   ranger   r   idheapqheapifyr   r9  heappopheappushr0   r$  r.  r'  removeupdater   r   )r(   rQ   r:  ElementTrA  rD  distsibox1jbox2r>  
skip_isanyr   id1id2r3  r@  groupr	  r)   rC  r*   group_textboxesd  sH   
	&




z!LTLayoutContainer.group_textboxesc                 C   s&  t dd | \}}|D ]}|| q|sd S t| ||}t dd |\}}|D ]}|| q*t| ||}|jd u r^|D ]}|| qAdtdttt	t	f fdd}	|j
|	d n"| ||| _t }
| jD ]}|| |
| qk|j
d	d d ttt || ttt | | _d S )
Nc                 S   s
   t | tS r&   )r0   r   r  r)   r)   r*   r    s   
 z+LTLayoutContainer.analyze.<locals>.<lambda>c                 S   s   |   S r&   )rq   r  r)   r)   r*   r    r  r=  r$   c                 S   s,   t | trd| j | j fS d| j | jfS )Nr   r   )r0   r$  rl   rk   rj   r=  r)   r)   r*   getkey  s   
z)LTLayoutContainer.analyze.<locals>.getkeyr  c                 S   r   r&   r'   rY  r)   r)   r*   r    s    )r   rT   r   r6  r?  rB   r1   r
   r9   rG   r   rX  r0  r"   r3   r   r   r]   r   )r(   rQ   textobjs	otherobjsr-   	textlinesempties	textboxestextboxrZ  assignerrW  r)   r)   r*   rT     s8   


zLTLayoutContainer.analyze)r6   r7   r8   r   r+   r:   r   r]   r   r   r6  r1   r?  r   r   r2   rX  rT   r)   r)   r)   r*   r/    s0    
P
!
[r/  c                   @   sJ   e Zd ZdZdedededdfddZdefd	d
Zde	ddfddZ
dS )LTFigurezRepresents an area used by PDF Form objects.

    PDF Forms can be used to present figures or pictures by embedding yet
    another PDF document within a page. Note that LTFigure objects can appear
    recursively.
    r   r^   r   r$   Nc           	         sj   || _  | _|\}}}}||f|| |f||| f|| || ff}t fdd|D }t| | d S )Nc                 3   s"    | ]\}}t  ||fV  qd S r&   r   )r   r   qr   r)   r*   r     s     z$LTFigure.__init__.<locals>.<genexpr>)r   r   r   r/  r+   )	r(   r   r^   r   r4   ywhboundsr)   rd  r*   r+     s   ,zLTFigure.__init__c                 C   s"   d| j j| jt| jt| jf S )Nz<%s(%s) %s matrix=%s>)rW   r6   r   r   r^   r    r   rK   r)   r)   r*   rL     s   zLTFigure.__repr__rQ   c                 C   s   |j sd S t| | d S r&   )rD   r/  rT   rS   r)   r)   r*   rT     s   zLTFigure.analyze)r6   r7   r8   rN   rP   r   r   r+   rL   r:   rT   r)   r)   r)   r*   rb    s
    	rb  c                	   @   s:   e Zd ZdZddedededdfdd	Zdefd
dZ	dS )LTPagezRepresents an entire page.

    Like any other LTLayoutContainer, an LTPage can be iterated to obtain child
    objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine.
    r   pageidr^   rotater$   Nc                 C   s   t | | || _|| _d S r&   )r/  r+   rj  rk  )r(   rj  r^   rk  r)   r)   r*   r+     s   zLTPage.__init__c                 C   r   )Nz<%s(%r) %s rotate=%r>)rW   r6   rj  r   r^   rk  rK   r)   r)   r*   rL     r   zLTPage.__repr__r5   )
r6   r7   r8   rN   r9   r   rG   r+   rP   rL   r)   r)   r)   r*   ri    s    ri  )FrH  loggingtypingr   r   r   r   r   r   r   r	   r
   r   r   r   pdfcolorr   pdffontr   	pdfinterpr   r   pdftypesr   utilsr   r   r   r   r   r   r   r   r   r   r   r    r!   	getLoggerr6   loggerr"   r:   r.   rV   r]   r   r   r   r   r   r   r   r   r   r   TextLineElementr   r   r  r1   r  r$  r%  r2   r'  r.  r/  rb  ri  r)   r)   r)   r*   <module>   sf    8
DI%!!I"DD

 { 