Use Case - PDF to HTML
对 PDF 文档元素进行分析非常困难, 但如果将它转换为 HTML 就简单多了. 因为 HTML 是一种完备的, 树形结构的标记语言, 它可以很好地表示文档的结构.
本文研究如何使用 PyMuPDF 将 PDF 文档转换为 HTML.
1# -*- coding: utf-8 -*-
2
3import fitz
4from pathlib import Path
5
def pdf_to_html(path_pdf: Path) -> Path:
    """Convert a PDF document into a single HTML file using PyMuPDF.

    Each page is extracted with ``page.get_text("xhtml")`` and the resulting
    fragments are written, in page order, to ``<stem>.html`` in the same
    directory as the input PDF. Every page's markup is followed by a form
    feed byte (0x0C) that acts as a page delimiter.

    Args:
        path_pdf: Path of the PDF file to convert.

    Returns:
        Path of the HTML file that was written.
    """
    # Derive the output location from the input itself instead of relying on
    # a module-level ``dir_here`` that only exists when run as a script.
    path_html = path_pdf.with_name(path_pdf.stem + ".html")

    # Context managers guarantee that both the document and the output file
    # are closed even if extraction fails part-way through.
    with fitz.open(str(path_pdf)) as doc, open(str(path_html), "wb") as out:
        for page in doc:
            # XHTML markup for this page, encoded once at the I/O boundary.
            out.write(page.get_text("xhtml").encode("utf8"))
            out.write(b"\x0c")  # page delimiter (form feed)

    return path_html
15
16
if __name__ == "__main__":
    # Directory that contains this script.
    dir_here = Path(__file__).absolute().parent

    # Sample document to convert. Alternative sample file names:
    # "Documentation-Best-Practices.pdf", "PDF-to-HTML-Test.pdf".
    path_pdf = dir_here / "PyMuPDF-The-Basics.pdf"

    pdf_to_html(path_pdf)