Upload 14 files
Browse files- README.md +2 -2
- config.json +2 -2
- metadata.json +12 -0
- pipeline.py +79 -0
README.md
CHANGED
@@ -4,10 +4,10 @@ language:
|
|
4 |
- en
|
5 |
license: apache-2.0
|
6 |
library_name: transformers
|
7 |
-
pipeline_tag:
|
8 |
tags:
|
9 |
- pdf-to-markdown
|
10 |
-
-
|
11 |
---
|
12 |
|
13 |
# MinerU PDF to Markdown Model
|
|
|
4 |
- en
|
5 |
license: apache-2.0
|
6 |
library_name: transformers
|
7 |
+
pipeline_tag: feature-extraction
|
8 |
tags:
|
9 |
- pdf-to-markdown
|
10 |
+
- feature-extraction
|
11 |
---
|
12 |
|
13 |
# MinerU PDF to Markdown Model
|
config.json
CHANGED
@@ -2,8 +2,8 @@
|
|
2 |
"architectures": ["MinerUModel"],
|
3 |
"model_type": "mineru",
|
4 |
"framework": "pytorch",
|
5 |
-
"task": "
|
6 |
-
"pipeline_tag": "
|
7 |
"model_name_or_path": "kitjesen/MinerU",
|
8 |
"auto_map": {
|
9 |
"AutoModel": "modeling.MinerUModel",
|
|
|
2 |
"architectures": ["MinerUModel"],
|
3 |
"model_type": "mineru",
|
4 |
"framework": "pytorch",
|
5 |
+
"task": "feature-extraction",
|
6 |
+
"pipeline_tag": "feature-extraction",
|
7 |
"model_name_or_path": "kitjesen/MinerU",
|
8 |
"auto_map": {
|
9 |
"AutoModel": "modeling.MinerUModel",
|
metadata.json
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"language": ["zh", "en"],
|
3 |
+
"license": "apache-2.0",
|
4 |
+
"tags": ["pdf-to-markdown", "feature-extraction"],
|
5 |
+
"pipeline_tag": "feature-extraction",
|
6 |
+
"library_name": "transformers",
|
7 |
+
"task_specific_params": {
|
8 |
+
"pdf-to-markdown": {
|
9 |
+
"max_length": 1024
|
10 |
+
}
|
11 |
+
}
|
12 |
+
}
|
pipeline.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Pipeline
|
2 |
+
import torch
|
3 |
+
from typing import Union, List
|
4 |
+
import fitz
|
5 |
+
import os
|
6 |
+
from detectron2.config import get_cfg
|
7 |
+
from detectron2.engine import DefaultPredictor
|
8 |
+
|
9 |
+
class MinerUPipeline(Pipeline):
    """Pipeline that converts a PDF file into a single Markdown string.

    Each page is rasterised, passed to a layout model, and every detected
    region is routed to a text, formula or table handler according to its
    ``type``. The per-page results are joined into Markdown in
    ``postprocess``.
    """

    def __init__(self, model_path, **kwargs):
        """Load every sub-model from ``model_path``.

        Args:
            model_path: Directory containing the ``models/`` tree
                (``Layout``, ``MFD``, ``MFR/UniMERNet``,
                ``TabRec/StructEqTable``).
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        # Imported locally because the module-level imports do not bring
        # ``AutoModel`` into scope; without this the constructor fails
        # with NameError.
        from transformers import AutoModel

        super().__init__(**kwargs)

        # Load the layout model.
        cfg = get_cfg()
        cfg.merge_from_file(os.path.join(model_path, "models/Layout/config.json"))
        cfg.MODEL.WEIGHTS = os.path.join(model_path, "models/Layout/model_final.pth")
        self.layout_model = DefaultPredictor(cfg)

        # Load the remaining models.
        # SECURITY NOTE: torch.load unpickles the file and can execute
        # arbitrary code; only point model_path at trusted checkpoints.
        self.formula_detector = torch.load(
            os.path.join(model_path, "models/MFD/weights.pt")
        )
        self.formula_recognizer = AutoModel.from_pretrained(
            os.path.join(model_path, "models/MFR/UniMERNet")
        )
        # The path now carries the same "models/" prefix as every other
        # sub-model. NOTE(review): confirm against the uploaded repo layout.
        self.table_recognizer = AutoModel.from_pretrained(
            os.path.join(model_path, "models/TabRec/StructEqTable")
        )

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into preprocess/forward/postprocess dicts.

        ``Pipeline`` declares this hook abstract and calls it from its
        constructor, so it must exist for the class to be instantiable.
        This pipeline has no tunable parameters, hence three empty dicts.
        """
        return {}, {}, {}

    def preprocess(self, pdf_path) -> List[torch.Tensor]:
        """Render every page of the PDF to a float tensor.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            One tensor per page, laid out as (channels, height, width).
        """
        pages = []
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Rasterise the page.
                pix = page.get_pixmap()
                # ``pix.samples`` is a flat bytes buffer of
                # height * width * n values; give it its 3-D shape before
                # permuting. ``bytearray`` makes a writable copy, which
                # ``torch.frombuffer`` expects.
                hwc = torch.frombuffer(
                    bytearray(pix.samples), dtype=torch.uint8
                ).reshape(pix.height, pix.width, pix.n)
                # Convert to the channel-first float layout.
                pages.append(hwc.permute(2, 0, 1).float())
        return pages

    def _forward(self, pages):
        """Run layout analysis on each page and process its regions.

        Args:
            pages: Page tensors as produced by ``preprocess``.

        Returns:
            One dict per page with the keys ``"text"``, ``"formulas"`` and
            ``"tables"``, each holding the processed regions of that kind.
            Regions of any other type are ignored.
        """
        results = []
        for page in pages:
            # 1. Layout analysis.
            # NOTE(review): detectron2's DefaultPredictor is documented to
            # take an HxWxC BGR numpy image and return a dict with an
            # "instances" entry; the loop below instead expects an
            # iterable of objects exposing ``.type`` and ``.image``.
            # Confirm that an adapter exists or add one.
            layout = self.layout_model(page)

            # 2. Route each region to the handler for its type.
            text_regions = []
            formula_regions = []
            table_regions = []

            for region in layout:
                if region.type == "text":
                    text_regions.append(self._process_text(region))
                elif region.type == "formula":
                    formula_regions.append(self._process_formula(region))
                elif region.type == "table":
                    table_regions.append(self._process_table(region))

            results.append({
                "text": text_regions,
                "formulas": formula_regions,
                "tables": table_regions,
            })

        return results

    def _process_text(self, region):
        """Return the text of a text region as a string.

        ``_forward`` calls this helper, so it must exist; ``postprocess``
        joins the results with ``str.join``, so it must return ``str``.
        NOTE(review): assumes the region exposes its content as ``.text``;
        a region without that attribute yields an empty string. Replace
        with the real OCR / text-extraction step.
        """
        return str(getattr(region, "text", ""))

    def _process_formula(self, region):
        """Detect the formula in the region image, then recognise it."""
        detected = self.formula_detector(region.image)
        return self.formula_recognizer(detected)

    def _process_table(self, region):
        """Run table recognition on the region image."""
        return self.table_recognizer(region.image)

    def postprocess(self, model_outputs):
        """Convert the per-page results into one Markdown string.

        For each page the text blocks come first, then the formulas
        wrapped in ``$$``, then the tables rendered with
        ``to_markdown()``. All pieces are separated by blank lines.
        """
        markdown = []
        for page in model_outputs:
            # Combine text, formulas and tables.
            markdown.extend(page["text"])
            markdown.extend([f"$${formula}$$" for formula in page["formulas"]])
            markdown.extend([table.to_markdown() for table in page["tables"]])
        return "\n\n".join(markdown)
|