feihu.hf committed on
Commit
602373e
1 Parent(s): 7f72bb4

update file types

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. patching.py +32 -3
  3. web_ui.py +2 -1
app.py CHANGED
@@ -71,7 +71,7 @@ def app_gui():
71
  'max_retries': 10,
72
  }},
73
  name='Qwen-Turbo-1M',
74
- description='Qwen-Turbo natively supports input length of up to 1M tokens. You can upload documents for Q&A, supporting file types: PDF/Word/PPT/TXT/HTML.',
75
  rag_cfg={'max_ref_token': 1000000, 'rag_searchers': ['no_search']},
76
  )
77
  chatbot_config = {
 
71
  'max_retries': 10,
72
  }},
73
  name='Qwen-Turbo-1M',
74
+ description='Qwen-Turbo natively supports input length of up to 1M tokens. You can upload documents for Q&A (eg., pdf/docx/pptx/txt/html).',
75
  rag_cfg={'max_ref_token': 1000000, 'rag_searchers': ['no_search']},
76
  )
77
  chatbot_config = {
patching.py CHANGED
@@ -68,6 +68,35 @@ def memory_run(self, messages: List[Message], lang: str = 'en', **kwargs) -> Ite
68
 
69
  Memory._run = memory_run
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def SimpleDocParser_call(self, params: Union[str, dict], **kwargs) -> Union[str, list]:
73
  params = self._verify_json_format_args(params)
@@ -88,7 +117,7 @@ def SimpleDocParser_call(self, params: Union[str, dict], **kwargs) -> Union[str,
88
  time1 = time.time()
89
 
90
  f_type = get_file_type(path)
91
- if f_type in PARSER_SUPPORTED_FILE_TYPES:
92
  if path.startswith('https://') or path.startswith('http://') or re.match(
93
  r'^[A-Za-z]:\\', path) or re.match(r'^[A-Za-z]:/', path):
94
  path = path
@@ -108,7 +137,7 @@ def SimpleDocParser_call(self, params: Union[str, dict], **kwargs) -> Union[str,
108
  parsed_file = parse_word(path, self.extract_image)
109
  elif f_type == 'pptx':
110
  parsed_file = parse_ppt(path, self.extract_image)
111
- elif f_type == 'txt':
112
  parsed_file = parse_txt(path)
113
  elif f_type == 'html':
114
  parsed_file = parse_html_bs(path, self.extract_image)
@@ -120,7 +149,7 @@ def SimpleDocParser_call(self, params: Union[str, dict], **kwargs) -> Union[str,
120
  parsed_file = parse_excel(path, self.extract_image)
121
  else:
122
  raise ValueError(
123
- f'Failed: The current parser does not support this file type! Supported types: {"/".join(PARSER_SUPPORTED_FILE_TYPES)}'
124
  )
125
  for page in parsed_file:
126
  for para in page['content']:
 
68
 
69
  Memory._run = memory_run
70
 
71
+ common_programming_language_extensions = [
72
+ "py", # Python
73
+ "java", # Java
74
+ "cpp", # C++
75
+ "c", # C
76
+ "h", # C/C++ 头文件
77
+ "cs", # C#
78
+ "js", # JavaScript
79
+ "ts", # TypeScript
80
+ "rb", # Ruby
81
+ "php", # PHP
82
+ "swift", # Swift
83
+ "go", # Go
84
+ "rs", # Rust
85
+ "kt", # Kotlin
86
+ "scala", # Scala
87
+ "m", # Objective-C
88
+ "css", # CSS
89
+ "sql", # SQL
90
+ "sh", # Shell
91
+ "pl", # Perl
92
+ "r", # R
93
+ "jl", # Julia
94
+ "dart", # Dart
95
+ "json", # JSON
96
+ "xml", # XML
97
+ "yml", # YAML
98
+ "toml", # TOML
99
+ ]
100
 
101
  def SimpleDocParser_call(self, params: Union[str, dict], **kwargs) -> Union[str, list]:
102
  params = self._verify_json_format_args(params)
 
117
  time1 = time.time()
118
 
119
  f_type = get_file_type(path)
120
+ if f_type in PARSER_SUPPORTED_FILE_TYPES + common_programming_language_extensions:
121
  if path.startswith('https://') or path.startswith('http://') or re.match(
122
  r'^[A-Za-z]:\\', path) or re.match(r'^[A-Za-z]:/', path):
123
  path = path
 
137
  parsed_file = parse_word(path, self.extract_image)
138
  elif f_type == 'pptx':
139
  parsed_file = parse_ppt(path, self.extract_image)
140
+ elif f_type == 'txt' or f_type in common_programming_language_extensions:
141
  parsed_file = parse_txt(path)
142
  elif f_type == 'html':
143
  parsed_file = parse_html_bs(path, self.extract_image)
 
149
  parsed_file = parse_excel(path, self.extract_image)
150
  else:
151
  raise ValueError(
152
+ f'Failed: The current parser does not support this file type! Supported types: {"/".join(PARSER_SUPPORTED_FILE_TYPES + common_programming_language_extensions)}'
153
  )
154
  for page in parsed_file:
155
  for para in page['content']:
web_ui.py CHANGED
@@ -10,6 +10,7 @@ from qwen_agent.gui.utils import convert_fncall_to_text, convert_history_to_chat
10
  from qwen_agent.llm.schema import CONTENT, FILE, IMAGE, NAME, ROLE, USER, Message
11
  from qwen_agent.log import logger
12
  from qwen_agent.utils.utils import print_traceback
 
13
 
14
  class WebUI:
15
  """A Common chatbot application for agent."""
@@ -129,7 +130,7 @@ class WebUI:
129
  'display': True
130
  }])
131
 
132
- input = mgr.MultimodalInput(placeholder=self.input_placeholder, upload_button_props=dict(file_types=[".pdf", ".doc", ".docx", ".ppt", ".pptx", ".txt", ".html"]))
133
 
134
  with gr.Column(scale=1):
135
  if len(self.agent_list) > 1:
 
10
  from qwen_agent.llm.schema import CONTENT, FILE, IMAGE, NAME, ROLE, USER, Message
11
  from qwen_agent.log import logger
12
  from qwen_agent.utils.utils import print_traceback
13
+ from patching import common_programming_language_extensions
14
 
15
  class WebUI:
16
  """A Common chatbot application for agent."""
 
130
  'display': True
131
  }])
132
 
133
+ input = mgr.MultimodalInput(placeholder=self.input_placeholder, upload_button_props=dict(file_types=[".pdf", ".docx", ".pptx", ".txt", ".html", ".csv", ".tsv", ".xlsx", ".xls"] + ["." + file_type for file_type in common_programming_language_extensions]))
134
 
135
  with gr.Column(scale=1):
136
  if len(self.agent_list) > 1: