Dmitry Chaplinsky committed on
Commit
0094748
1 Parent(s): b0b8d8f

Trying text2text demo

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. pipeline.py +3 -39
README.md CHANGED
@@ -2,7 +2,7 @@
2
  language:
3
  - uk
4
  tags:
5
- - token-classification
6
  - punctuation prediction
7
  - punctuation
8
  library_name: generic
 
2
  language:
3
  - uk
4
  tags:
5
+ - text2text-generation
6
  - punctuation prediction
7
  - punctuation
8
  library_name: generic
pipeline.py CHANGED
@@ -10,49 +10,13 @@ class PreTrainedPipeline:
10
  # This function is only called once, so do all the heavy processing I/O here"""
11
  self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
12
 
13
- def apply_label_to_token(self, token: str, label: str) -> str:
14
- punct, upper = label
15
-
16
- if punct != "O":
17
- token += punct
18
-
19
- if upper == "U":
20
- token = token.title()
21
-
22
- return token
23
-
24
- def __call__(self, inputs: str) -> List[Dict[str, Any]]:
25
  """
26
  Args:
27
  inputs (:obj:`str`):
28
  a string containing some text
29
  Return:
30
- A :obj:`list`:. The object returned should be like [{"entity_group": "XXX", "word": "some word", "start": 3, "end": 6, "score": 0.82}] containing :
31
- - "entity_group": A string representing what the entity is.
32
- - "word": A substring of the original string that was detected as an entity.
33
- - "start": the offset within `input` leading to `answer`. context[start:stop] == word
34
- - "end": the ending offset within `input` leading to `answer`. context[start:stop] === word
35
- - "score": A score between 0 and 1 describing how confident the model is for this entity.
36
  """
37
  inputs = inputs.strip()
38
- labels = self.model.add_punctuation_capitalization([inputs], return_labels=True)[0].split()
39
-
40
- tokens = inputs.split()
41
-
42
- res: List[Dict[str, Any]] = []
43
- offset = 0
44
- for tok, lab in zip(tokens, labels):
45
- if lab != "OO":
46
- res.append(
47
- {
48
- "entity_group": lab,
49
- "word": tok,
50
- "start": offset,
51
- "end": offset + len(tok),
52
- "score": 0.99,
53
- }
54
- )
55
-
56
- offset += len(tok) + 1
57
-
58
- return res
 
10
  # This function is only called once, so do all the heavy processing I/O here"""
11
  self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
12
 
13
+ def __call__(self, inputs: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
14
  """
15
  Args:
16
  inputs (:obj:`str`):
17
  a string containing some text
18
  Return:
19
+ A :obj:`str`
 
 
 
 
 
20
  """
21
  inputs = inputs.strip()
22
+ return self.model.add_punctuation_capitalization([inputs])[0]