dchaplinsky
/

punctuation_uk_bert

Text2Text Generation

punctuation prediction

Model card Files Files and versions Community

Dmitry Chaplinsky committed on Sep 17, 2022

Commit

0094748

•

1 Parent(s): b0b8d8f

Trying text2text demo

Files changed (2) hide show

README.md +1 -1
pipeline.py +3 -39

README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 language:
   - uk
 tags:
-  - token-classification
   - punctuation prediction
   - punctuation
 library_name: generic

 language:
   - uk
 tags:
+  - text2text-generation
   - punctuation prediction
   - punctuation
 library_name: generic

pipeline.py CHANGED Viewed

@@ -10,49 +10,13 @@ class PreTrainedPipeline:
         # This function is only called once, so do all the heavy processing I/O here"""
         self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
-    def apply_label_to_token(self, token: str, label: str) -> str:
-        punct, upper = label
-        if punct != "O":
-            token += punct
-        if upper == "U":
-            token = token.title()
-        return token
-    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
         """
         Args:
             inputs (:obj:`str`):
                 a string containing some text
         Return:
-            A :obj:`list`:. The object returned should be like [{"entity_group": "XXX", "word": "some word", "start": 3, "end": 6, "score": 0.82}] containing :
-                - "entity_group": A string representing what the entity is.
-                - "word": A substring of the original string that was detected as an entity.
-                - "start": the offset within `input` leading to `answer`. context[start:stop] == word
-                - "end": the ending offset within `input` leading to `answer`. context[start:stop] === word
-                - "score": A score between 0 and 1 describing how confident the model is for this entity.
         """
         inputs = inputs.strip()
-        labels = self.model.add_punctuation_capitalization([inputs], return_labels=True)[0].split()
-        tokens = inputs.split()
-        res: List[Dict[str, Any]] = []
-        offset = 0
-        for tok, lab in zip(tokens, labels):
-            if lab != "OO":
-                res.append(
-                    {
-                        "entity_group": lab,
-                        "word": tok,
-                        "start": offset,
-                        "end": offset + len(tok),
-                        "score": 0.99,
-                    }
-                )
-            offset += len(tok) + 1
-        return res

         # This function is only called once, so do all the heavy processing I/O here"""
         self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
+    def __call__(self, inputs: str) -> str:
         """
         Args:
             inputs (:obj:`str`):
                 a string containing some text
         Return:
+            A :obj:`str`
         """
         inputs = inputs.strip()
+        return self.model.add_punctuation_capitalization([inputs])[0]