Use Case - PDF to HTML

对 PDF 文档元素进行分析非常困难, 但如果将它转换为 HTML 就简单多了. 因为 HTML 是一个完备的, 树形结构的标记语言, 它可以很好地表示文档的结构. 本文研究如何使用 pymupdf 将 PDF 文档转换为 HTML.

 1# -*- coding: utf-8 -*-
 2
 3import fitz
 4from pathlib import Path
 5
 6def pdf_to_html(path_pdf: Path) -> Path:
 7	path_html = dir_here.joinpath(path_pdf.stem + ".html")
 8	doc = fitz.open(str(path_pdf))
 9	out = open(str(path_html), "wb") # create a text output
10	for page in doc: # iterate the document pages
11		text = page.get_text("xhtml").encode("utf8") # get plain text (is in UTF-8)
12		out.write(text) # write text of page
13		out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
14	out.close()
15
16
if __name__ == "__main__":
    # Directory that contains this script; the sample PDFs are looked up here.
    dir_here = Path(__file__).absolute().parent

    # Sample document to convert. Other samples that can be swapped in:
    #   "Documentation-Best-Practices.pdf"
    #   "PDF-to-HTML-Test.pdf"
    path_pdf = dir_here / "PyMuPDF-The-Basics.pdf"

    pdf_to_html(path_pdf)