o
    ݉f                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZ ddl	m
Z
 dd Zdd Zd5ddZdd Zdd Zd6ddZdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Z e!d4kre   dS dS )7    N)DictListSet   )pymupdfc                 C   s   d|   ddS )Nz %s K   -)center)x r   H/home/ubuntu/flask/venv/lib/python3.10/site-packages/pymupdf/__main__.pymycenter   s   r   c                 C   s   |d }|d }|dkr|  |S dd }t| |}t| |}	 |j|jkr;|j|j  kr4dkr;n n|jdksOtd||f  t| d}||S t|}||j d }}||S )zReturn image for a given XREF.r   r   c                 S   s"   | j jdkr| S ttj| }|S )N   )
colorspacenr   PixmapcsRGB)pixtpixr   r   r   getimage   s   zrecoverpix.<locals>.getimagez&Warning: unsupported /SMask %i for %i:N)	extract_imager   r   irectalphar   message	set_alphasamples)docitemr
   sr   pix1pix2r   r   r   r   
recoverpix   s"   
2

r!   FTc                 C   s   t | }|js|du rtd d}|js|S |r9||}|s&td |du r7t |dkr4dd nd |S td	|j  |S )
z!Open and authenticate a document.Tz$this command supports PDF files onlyzauthentication unsuccessful   zauthenticated as %sowneruserz'%s' requires a password)	r   openis_pdfsysexit
needs_passauthenticater   name)filenamepasswordshowpdfr   rcr   r   r   	open_file9   s   



r2   c                 C   sL   t dd |  D d }|  D ]\}}d|||f }t| qdS )zPrint a Python dictionary.c                 S   s   g | ]}t |qS r   )len).0kr   r   r   
<listcomp>N       zprint_dict.<locals>.<listcomp>r   z%s: %sN)maxkeysitemsrjustr   r   )r   lr5   vmsgr   r   r   
print_dictL   s
   r?   c                 C   s   t d|  | |}t | | |rF| }z|dd }|| }|dr-d}W n ty9   d}Y nw t d|  t d t d d	S )
zPrint an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size.
    z%i 0 objz/Lengthr   z0 Runknownzstream
...%s bytes	endstreamendobjN)r   r   xref_objectxref_is_streamsplitindexendswith	Exception)r   xrefxref_strtempidxsizer   r   r   
print_xrefT   s"   




rN   pagec              	   C   sl  t |d }| d|dd} | d}g }t|D ]\}}|d }| rIt|}	d|	  kr5|k r?n n|t| n	td||f  qz|d\}
}t|
}
t|}W n t	yl   td||f  Y nw d|
  krw|k rn nd|  kr|k sn td||f  |
|kr||
 q|
|k r|t
t|
|d 7 }q|t
t|
|d d	7 }q|S )
aK  Transform a page / xref specification into a list of integers.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.
    r   N  ,zbad %s specification at item %ir   z%bad %s range specification at item %ir"   )strreplacerE   	enumerate	isdecimalintappendr(   r)   rH   listrange)rlistlimitwhatrP   	rlist_arrout_listseqr   r   ii1i2r   r   r   get_listk   s8   
.
re   c                 C   s  t | j| jd}tj| jd }d}|dkr|d }d}t|d}|j}t	d| j|j
| d |||d |d	 f  |j}|d
krW| }t	d||dkrRdndf  | }|d
krft	d|  t	  | jrt	td | }t|| t	  | jrt	td t|j t	  | jrt	td t| j| dd}|D ]}t|| t	  q| jrt	td t| j|j
d }	|	D ]}
|
d }||}t	d|
  t|| t	  q| jrt	td t	|  t	  |  d S )NTi   KBi  MBr   z7'%s', pages: %i, objects: %i, %g %s, %s, encryption: %sformat
encryptionr   z5document contains %i root form fields and is %ssigned   znot rR   z#document contains %i embedded fileszPDF catalogzPDF metadatazobject informationrI   )r^   zpage informationzPage %i:zPDF trailer)r2   inputr.   ospathgetsizeroundmetadatar   r   
page_countxref_lengthis_form_pdfget_sigflagsembfile_countcatalogr   pdf_catalogrN   r?   xrefsre   pages	page_xreftrailerpdf_trailerclose)argsr   rM   flagmetar   r   rI   xreflpagelpnor   r   r   r/      sz   








r/   c                 C   s   t | j| jdd}| j}d|}| js0|j| j| j| j	| j
| j| j| j|| j| j| jd d S t| j|jd }t }|D ]}|d }|j|||d q?|j| j| j| j	| j
| j| j| j|| j| j| jd |  |  d S )NTr0   keepnonezrc4-40zrc4-128zaes-128zaes-256)
garbagedeflateprettycleanasciilinearri   owner_pwuser_pwpermissionsr   	from_pageto_page)r2   rk   r.   ri   rF   ry   saveoutputr   compressr   sanitizer   r   r$   r%   
permissionre   rq   r   r&   
insert_pdfr}   )r~   r   ri   encryptry   outdocr   r   r   r   r   r      sR   r   c           
      C   s   | j }t }|D ]T}|d}t|dkr|d nd}t|d |dd}d|dd }|r@td|dd |jd }nt	d|jd }|D ]}	|j
||	d |	d d qJ|  q	|j| jd	dd
 |  dS )z&Join pages from several PDF documents.rS   r   Nr   Tr   r#   r   r   )r   r   )rk   r   r&   rE   r3   r2   joinre   rq   r[   r   r}   r   r   )
r~   doc_listr   src_itemsrc_listr.   srcry   	page_listrb   r   r   r   doc_join  s   
 
r   c           	      C   sH  t | j| jdd}| s| jr| j| jkrtd t | j| j}| j	r*t
| j	nt
 }t
| }|r?||ks>td n|}|sHtd |t
| @ }|r[tdt|  |D ]&}||}||}|j|||d |d |d	 d
 td||j	f  q]|  | jr| j| jkr|j| jdd n|  |  dS )z!Copy embedded files between PDFs.Tr   cannot save PDF incrementallyz%not all names are contained in sourceznothing to copyz2following names already exist in receiving PDF: %sr-   	ufilenamedescr-   r   r   zcopied entry '%s' from '%s'rj   r   N)r2   rk   r.   can_save_incrementallyr   r(   r)   source	pwdsourcer,   setembfile_namesrT   embfile_infoembfile_getembfile_addr   r   r}   r   saveIncr)	r~   r   r   names	src_names	intersectr   infobuffr   r   r   embedded_copy  sF   




r   c              
   C   s   t | j| jdd}| s| jr| j| jkrtd ttj	j
f}tjdk r(t}z|| j W n |yN } ztd| jd|  W Y d}~nd}~ww | jrX| j| jkr]|  n|j| jdd	 |  dS )
zDelete an embedded file entry.Tr   r   r      no such embedded file : Nr   r   )r2   rk   r.   r   r   r(   r)   
ValueErrorr   mupdfFzErrorBasemupdf_version_tupleembfile_delr,   r   r   r}   )r~   r   exception_typeser   r   r   embedded_delE  s&   

$
r   c              
   C   s   t | j| jdd}ttjjf}tjdk rt}z|| j	}|
| j	}W n |yB } ztd| j	d|  W Y d}~nd}~ww | jrI| jn|d }t|d}|| W d   n1 sbw   Y  td	| j	|f  |  dS )
z&Retrieve contents of an embedded file.Tr   r   r   r   Nr-   wbzsaved entry '%s' as '%s')r2   rk   r.   r   r   r   r   r   r   r,   r   r(   r)   r   r&   writer   r}   )r~   r   r   streamdr   r-   r   r   r   r   embedded_get[  s"   
$r   c                 C   s8  t | j| jdd}| s| jdu s| j| jkrtd z|| j td| j  W n	 t	y6   Y nw t
j| jrEt
j| jsMtd| j  t| jd}| }W d   n1 sbw   Y  | j}|}| jsr|}n| j}|j| j||||d | jr| j| jkr|  n|j| jd	d
 |  dS )zInsert a new embedded file.Tr   Nr   zentry '%s' already existszno such file '%s'rbr   rj   r   )r2   rk   r.   r   r   r(   r)   r   r,   rH   rl   rm   existsisfiler&   readr   r   r   r   r}   r~   r   fr   r-   r   r   r   r   r   embedded_addm  s6   


r   c                 C   sd  t | j| jdd}| s| jdu s| j| jkrtd z|| j W n t	y6   td| j  Y nw | j
duretj
| j
retj
| j
ret| j
d}| }W d   n1 s_w   Y  nd}| jrn| j}nd}| jrw| j}n	| jr~| j}nd}| jr| j}nd}|j| j||||d | jdu s| j| jkr|  n|j| jdd	 |  dS )
z0Update contents or metadata of an embedded file.Tr   Nr   no such embedded file '%s'r   r   rj   r   )r2   rk   r.   r   r   r(   r)   r   r,   rH   rm   rl   r   r   r&   r   r-   r   r   embfile_updr   r   r}   r   r   r   r   embedded_upd  sJ   



r   c                 C   s.  t | j| jdd}| }| jdurI| j|vr td| j  n)t  tdt	|t	|dkr2dndf  t  t
|| j t  dS |sUtd	|j  dS t	|dkred
|jt	|f }nd|j }t| t  |D ]}| jst| qu||}t
|| t  qu|  dS )zList embedded files.Tr   Nr   z!printing 1 of %i embedded file%s:r   r   rR   z'%s' contains no embedded filesz-'%s' contains the following %i embedded filesz)'%s' contains the following embedded file)r2   rk   r.   r   r,   r(   r)   r   r   r3   r?   r   detailr}   )r~   r   r   r>   r,   _r   r   r   embedded_list  s>   






r   c              
   C   s  | j s| jstd t| j| jdd}| jr!t| j|j	d }nt
d|j	d }| js4tjtj}n| j}tj|rCtj|sJtd|  t }t }|D ]}| j r||d }|D ]N}|d }	|	|vr||	 ||	\}
}}}|dks~|sq`tj||
dd	 d	|	 d
| }t|d}|| W d   n1 sw   Y  d}q`| jr&||d }|D ]i}|d }	|	|vr%||	 t||}t|tu r|d }|d }tj|d|	|f }t|d}|| W d   n	1 sw   Y  qtj|d|	 }|jjdk r|nt !t j"|}|#| qqR| j r6t $dt%||f  | jrEt $dt%||f  |&  dS )z)Extract images and / or fonts from a PDF.z"neither fonts nor images requestedTr   r   z"output directory %s does not existr   zn/arQ   r   .r   Nextimagez	img-%i.%sz
img-%i.pngr   zsaved %i fonts to '%s'zsaved %i images to '%s')'fontsimagesr(   r)   r2   rk   r.   ry   re   rq   r[   r   rl   rm   abspathcurdirr   isdirr   get_page_fontsaddextract_fontr   rU   r&   r   get_page_imagesr!   typedictr   r   r   r   r   r   r   r3   r}   )r~   r   ry   out_dir
font_xrefsimage_xrefsr   itemlistr   rI   fontnamer   r   bufferoutnameoutfiler   imgdatar    r   r   r   extract_objects  st   





r   c           	      C   sX   |rdnt dg}| jd|d}|s|s|| d S ||jddd || d S )N   
   textflagsutf8surrogatepasserrors)bytesget_textr   encode)	rO   textoutGRIDfontsize
noformfeed
skip_emptyr   eopr   r   r   r   page_simple'  s   

r   c           
      C   sz   |rdnt dg}| jd|d}|g kr|s|| d S |jdd d |D ]}	||	d jd	d
d q'|| d S )Nr   r   blocksr   c                 S   s   | d | d fS )Nrj   r   r   )br   r   r   <lambda>:  s    z page_blocksort.<locals>.<lambda>keyr   r   r   r   )r   r   r   sortr   )
rO   r   r   r   r   r   r   r   r   r  r   r   r   page_blocksort3  s   

r  c           !         s@  |rdnt dg}dtt dtdtfdd}dtt dtfd	d
}	dtt dtjf fdd}
dtdtfdddd }| jd|dd }|
|| \}}}}}|g kr]|s[|	| d S |	||}|j
dd d i }|D ]}|\}}}}|||}||g }|| |||< qnt| }|
  || }i }|D ]/}|| }t|}|dk rd||< qdd |D }|
  t|}||k r|}|d ||< q||d |d   |t|  d }|d }|	d |D ].}||k r|	d ||7 }||k s||||| || } |	| d  jd!d"d# || }q|	| d S )$Nr   r   valuesvaluereturnc                 S   s,   t | |}|r| |d  S td|| f )zFind the right row coordinate.

        Args:
            values: (list) y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-ccordinate of appropriate line for value.
        r   zLine for %g not found in %s)bisectbisect_rightRuntimeError)r  r  rb   r   r   r   find_line_indexE  s   	z$page_layout.<locals>.find_line_indexrowsc                 S   sJ   t | } |   | d g}| dd  D ]}||d | kr"|| q|S )Nr   r   r"   )rZ   r  rY   )r  r   nrowshr   r   r   curate_rowsT  s   

z page_layout.<locals>.curate_rowsr   rO   c              
      s  t  }|jj}|jj}|}|}d}g }| D ]}	|	d D ]}
|
d dkr$q|
d \}}}}|dk s6||jjkr7q|| }||krA|}|
d D ]}|d  krNqE|d D ]z}|d \}}}}|| }|d	 \}}tt|}|| |d
 }||kr|dkr|}||k r|}|dkr|g kr|d \}}}}||kr|tdkr|| }n|dkrtd}n|dkrtd}n|}||||f|d< qR|||||f qRqEqq|||||fS )Nr   linesdir)r   r   bboxspansrM   charsorigincrQ   r"      rb     r<     )	r   rectwidthheightrX   ro   r   chrrY   )r   rO   r  
page_widthpage_height	rowheightleftrightr  blocklinex0y0x1y1r  spanr  r   cwidthoxoychold_chold_oxold_oy
old_cwidthligr   joinligaturer   r   process_blocks]  s`   


)z#page_layout.<locals>.process_blocksr4  c                 S   st   | dkrt dS | dkrt dS | dkrt dS | dkr t dS | d	kr(t d
S | dkr0t dS | dkr8t dS | S )zReturn ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00)
        ffr  fii  fli  ffir  fflr  fti  sti  )r  )r4  r   r   r   r6    s   	z!page_layout.<locals>.joinligaturec                 S   s   d}d}d}d}|t jkrtd| |D ]`}|\}	}
}}|
|  }
|
| }||	kr2|
| |d kr2q|	dkr?||
 | dkr?q|	}|
|| k rP||	7 }|}|
}q|	dkrUqt|
| t| }|
|krm|dkrm|d| 7 }||	7 }|}|
}q| S )a  Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            chars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        rR   r   z%program error: minslot too small = %gg?rQ   g?r   )r   EPSILONr  rX   r3   rstrip)r#  slotminslotlcharsr   old_charold_x1r1  r  charr-  r   r,  r)  deltar   r   r   make_textline  s:   
z"page_layout.<locals>.make_textlinerawdictr   c                 S   s   | d S )Nr   r   )r  r   r   r   r    s    zpage_layout.<locals>.<lambda>r  r#   r   c                 S   s   g | ]}|d  qS )rj   r   )r4   r  r   r   r   r6     r7   zpage_layout.<locals>.<listcomp>r   r"   g333333?
r   r   r   )r   r   rX   r   r   r   PagerT   r   r   r  getrY   rZ   r9   r3   
statisticsmedianr   )!rO   r   r   r   r   r   r   r   r  r  r7  rH  r   r  r  r#  r$  r"  r  r  r   r.  yrC  r9   rA  minslotsr5   ccountwidths	this_slotrowposr   r   r5  r   page_layoutA  sb   	46






$


rU  c                 C   s   t | j| jdd}t| j|jd }| j}|d u r&tj	|j
\}}|d }t|dJ}tjtjB }| jr:|tjN }| jrB|tjN }| jrJ|tjN }tttd}|D ]}	||	d  }
|| j |
|| j| j| j| j|d qRW d    d S 1 sxw   Y  d S )NFr   r   z.txtr   simpler   layoutr   )r2   rk   r.   re   ry   rq   r   rl   rm   splitextr,   r&   r   TEXT_PRESERVE_LIGATURESTEXT_PRESERVE_WHITESPACEconvert_whitenoligaturesextra_spacesTEXT_INHIBIT_SPACESr   r  rU  modegridr   r   r   )r~   r   r   r   r-   r   r   r   funcr   rO   r   r   r   gettext"  s>   


"rc  c                 C   s   t d t d d S )NzThis is from PyMuPDF message().zThis is from PyMuPDF log().)r   r   log)r~   r   r   r   	_internalC  s   
re  c                  C   s:  t jdtdd} | jddd}|jdtdd	}|jd
tdd |jddd |jdddd |jdddd |jdddd |jdtdd |jdtdd |jtd |jdtdd	}|jd
tdd |jdtd d |jddd |jd!d"d#d$d% |jd&td'd |jd(td)d |jd*t	d+t
d,d-d. |jd/dd0d1d2 |jd3dd0d4d2 |jd5dd0d6d2 |jd7t	d8d9d: |jd;dd0d<d2 |jd=dd0d>d2 |jdd?d |jtd |jd@tdAdBdC}|jd
dDdEdF |jdGdHdIdJ |jtd |jdKtdLd	}|jd
tdd |jdMddNd |jdOddPd |jdGdQd |jddd |jdtdRd |jtd |jdStdTd	}|jd
dd |jdUdVd |jdWddXd |jddd |jtd |jdYtdZd	}|jd
dd |jddd |jdGd[d |jdUdHd\dJ |jd]dHd^dJ |jd_d`d |jtd |jdatdbd	}|jd
dd |jddd |jdGd[d |jdUdHdcdJ |jtd |jddtdedfdC}|jd
dd |jdUdHdgdJ |jddd |jdGdhd |jd]did |jdjdkd |jdldmd |jd_dnd |jtd |jdotdpd	}	|	jd
tdd |	jdUdHdgdJ |	jddd |	jdGdqd |	jtd |jdrtdsd	}
|
jd
tdtd |
jddud |
jdGdvd |
jdwdHdxdJ |
jdydzd |
jdUdDd{dF |
jtd |jd|td}d	}|jd
td~d |jddd |jdtdddd. |jdtddd |jdddd0d |jdddd0d |jdddd0d |jdddd0d |jdddd0d |jdGdd |jdtddd |jdtddd |jtd |jdtdd	}|jtd |  }t|ds|   dS || dS )zDefine command configurations.r   zBasic PyMuPDF Functions)progdescriptionSubcommandsz/Enter 'command -h' for subcommand specific help)titlehelpr/   zdisplay PDF information)rg  rk   zPDF filename)r   rj  z	-passwordr.   )rj  z-catalog
store_truezshow PDF catalog)actionrj  z-trailerzshow PDF trailerz	-metadatazshow PDF metadataz-xrefsz&show selected objects, format: 1,5-7,Nz-pagesz'show selected pages, format: 1,5-7,50-N)rb  r   z.optimize PDF, or create sub-PDF if pages givenr   zoutput PDF filenamez-encryptionzencryption methodr   r   )rj  choicesdefaultz-ownerzowner passwordz-userzuser passwordz-garbagezgarbage collection level   r   )r   rj  rm  rn  z	-compressFzcompress (deflate) output)rl  rn  rj  z-asciizASCII encode binary dataz-linearzformat for fast web displayz-permissionr"   zinteger with permission levels)r   rn  rj  z	-sanitizezsanitize / clean contentsz-prettyzprettify PDF structurez/output selected pages pages, format: 1,5-7,50-Nr   zjoin PDF documentsz3specify each input as 'filename[,password[,pages]]')rg  epilog*zinput filenames)nargsrj  z-outputTzoutput filename)requiredrj  extractz extract images and fonts to diskz-imageszextract imagesz-fontszextract fontsz-folder to receive output, defaults to currentz-consider these pages only, format: 1,5-7,50-Nz
embed-infozlist embedded filesz-namezif given, report only this onez-detailzdetail informationz	embed-addzadd embedded filez-output PDF filename, incremental save if nonezname of new entryz-pathzpath to data for new entryz-desczdescription of new entryz	embed-delzdelete embedded filezname of entry to deletez	embed-updzupdate embedded filez*except '-name' all parameters are optionalzname of entryz-Output PDF filename, incremental save if nonezpath to new data for entryz	-filenameznew filename to store in entryz
-ufilenamez&new unicode filename to store in entryz!new description to store in entryzembed-extractzextract embedded file to diskz'output filename, default is stored namez
embed-copyz copy embedded files between PDFszPDF to receive embedded fileszpassword of inputz2output PDF, incremental save to 'input' if omittedz-sourcezcopy embedded files from herez
-pwdsourcezpassword of 'source' PDFzrestrict copy to these entriesrc  z(extract text in various formatting modeszinput document filenamezpassword for input documentz-modez-mode: simple, block sort, or layout (default)rV  rX  z select pages, format: 1,5-7,50-Nz1-N)r   rj  rn  z-noligaturesz*expand ligature characters (default False))rl  rj  rn  z-convert-whitez6convert whitespace characters to white (default False)z-extra-spacesz%fill gaps with spaces (default False)z-noformfeedz-write linefeeds, no formfeeds (default False)z-skip-emptyz+suppress pages with no text (default False)z3store text in this file (default inputfilename.txt)z-gridz+merge lines if closer than this (default 2)r#   z	-fontsizez4only include text with a larger fontsize (default 3)rj   internalzinternal testingrb  N)argparseArgumentParserr   add_subparsers
add_parseradd_argumentrT   set_defaultsr/   rX   r[   r   r   r   r   r   r   r   r   r   floatrc  re  
parse_argshasattr
print_helprb  )parsersubpsps_showps_cleanps_join
ps_extractps_embed_addps_embed_delps_embed_updps_embed_extractps_embed_copy
ps_gettextps_internalr~   r   r   r   mainG  s  r  __main__)FT)rO   )"rv  r
  rl   r(   rM  typingr   r   r   rR   r   r   r!   r2   r?   rN   re   r/   r   r   r   r   r   r   r   r   r   r   r  rU  rc  re  r  __name__r   r   r   r   <module>   sF   
$
/?/("2$B b!  4
