kitjesen committed on
Commit
8afa9a1
·
verified ·
1 Parent(s): 9f541bc

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +38 -3
  2. app.py +1 -0
  3. config.json +28 -0
  4. metadata.json +6 -0
  5. model_loader.py +31 -0
  6. requirements.txt +7 -0
README.md CHANGED
@@ -1,3 +1,38 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MinerU PDF to Markdown Model
2
+
3
+ 这个模型可以将PDF文档转换为Markdown格式。
4
+
5
+ ## 模型架构
6
+ MinerU使用多模型组合架构:
7
+ - Layout: 文档布局分析
8
+ - MFD: 数学公式检测
9
+ - MFR: 数学公式识别
10
+ - TabRec: 表格识别与重建
11
+
12
+ ## 使用方法
13
+
14
+ ```python
15
+ from transformers import pipeline
16
+
17
+ # 初始化转换器
18
+ converter = pipeline("pdf-to-markdown", model="your-username/MinerU")
19
+
20
+ # 转换PDF文件
21
+ markdown = converter("document.pdf")
22
+ ```
23
+
24
+ ## 模型信息
25
+ - 任务: PDF到Markdown转换
26
+ - 框架: PyTorch
27
+ - 许可: Apache 2.0
28
+
29
+ ## 系统要求
30
+ - Python >= 3.7
31
+ - PyTorch >= 1.9.0
32
+ - transformers >= 4.28.0
33
+ - detectron2
34
+
35
+ ## 限制说明
36
+ - 支持的最大页数: XX页
37
+ - 支持的PDF最大大小: XX MB
38
+ - 支持的语言: 中文、英文
app.py ADDED
@@ -0,0 +1 @@
 
 
1
+
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": ["MinerUModel"],
3
+ "model_type": "mineru",
4
+ "framework": "pytorch",
5
+ "task": "document-conversion",
6
+ "pipeline_tag": "document-conversion",
7
+ "submodels": {
8
+ "layout": {
9
+ "type": "detectron2",
10
+ "path": "models/Layout/model_final.pth",
11
+ "config": "models/Layout/config.json"
12
+ },
13
+ "formula_detection": {
14
+ "type": "pytorch",
15
+ "path": "models/MFD/weights.pt"
16
+ },
17
+ "formula_recognition": {
18
+ "type": "transformers",
19
+ "path": "models/MFR/UniMERNet",
20
+ "model_type": "bert"
21
+ },
22
+ "table_recognition": {
23
+ "type": "transformers",
24
+ "path": "models/TabRec/StructEqTable",
25
+ "model_type": "t5"
26
+ }
27
+ }
28
+ }
metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "language": ["zh", "en"],
3
+ "license": "apache-2.0",
4
+ "tags": ["document-conversion", "pdf-to-markdown"],
5
+ "pipeline_tag": "document-conversion"
6
+ }
model_loader.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModel, AutoTokenizer
3
+ from detectron2.config import get_cfg
4
+ from detectron2.engine import DefaultPredictor
5
+ import os
6
+
7
class MinerUModelLoader:
    """Load the MinerU sub-models from a local directory tree."""

    @staticmethod
    def load_models(base_path):
        """Load the four sub-models stored under ``base_path``.

        Args:
            base_path: Directory that contains the ``models/`` tree
                (``Layout``, ``MFD``, ``MFR`` and ``TabRec`` sub-folders).

        Returns:
            A dict with the keys ``"layout"``, ``"formula_detector"``,
            ``"formula_recognizer"`` and ``"table_recognizer"``.
        """
        models = {}

        # On a host without CUDA both the detectron2 predictor (whose
        # default MODEL.DEVICE is "cuda") and torch.load of a checkpoint
        # saved from a GPU would fail, so fall back to the CPU there.
        # When CUDA is available nothing is overridden.
        cuda_available = torch.cuda.is_available()

        # Layout model
        cfg = get_cfg()
        cfg.merge_from_file(os.path.join(base_path, "models/Layout/config.json"))
        cfg.MODEL.WEIGHTS = os.path.join(base_path, "models/Layout/model_final.pth")
        if not cuda_available:
            cfg.MODEL.DEVICE = "cpu"
        models["layout"] = DefaultPredictor(cfg)

        # Formula detection model
        # NOTE(review): torch.load unpickles the file; only load weights
        # that come from a trusted source.
        models["formula_detector"] = torch.load(
            os.path.join(base_path, "models/MFD/weights.pt"),
            map_location=None if cuda_available else "cpu",
        )

        # Formula recognition model
        models["formula_recognizer"] = AutoModel.from_pretrained(
            os.path.join(base_path, "models/MFR/UniMERNet")
        )

        # Table recognition model
        models["table_recognizer"] = AutoModel.from_pretrained(
            os.path.join(base_path, "models/TabRec/StructEqTable")
        )

        return models
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers>=4.28.0
2
+ torch>=1.9.0
3
+ PyMuPDF
4
+ detectron2
5
+ numpy
6
+ opencv-python
7
+ pandas