Spaces:
Running
Running
import streamlit as st | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
import pandas as pd | |
from fpdf import FPDF | |
# User interface: page-level settings (browser tab title, icon, layout).
_page_settings = {
    "layout": "wide",
    "page_icon": "📝",
    "page_title": "Traduction d'une phrase en pictogrammes ARASAAC",
}
st.set_page_config(**_page_settings)
# Load the model and the tokenizer.
# Alternative checkpoint: "Propicto/t2p-t5-large-orfeo"
checkpoint = "Propicto/t2p-nllb-200-distilled-600M-all"


@st.cache_resource
def _load_translation_model(checkpoint_name):
    """Load the tokenizer and seq2seq model for ``checkpoint_name``.

    Streamlit re-executes this script from the top on every user
    interaction. Caching the loaded objects as a shared resource avoids
    re-instantiating the model on each rerun.

    Args:
        checkpoint_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(checkpoint_name),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name),
    )


tokenizer, model = _load_translation_model(checkpoint)
# Read the lexicon.
def read_lexicon(lexicon):
    """Load the tab-separated lexicon and derive a lookup key per lemma.

    Args:
        lexicon: Path or file-like object handed to ``pandas.read_csv``.
            The data must contain a ``lemma`` column.

    Returns:
        The loaded DataFrame with an extra ``keyword_no_cat`` column: the
        part of ``lemma`` before the first ``' #'``, stripped of
        surrounding whitespace, with spaces replaced by underscores.
    """
    table = pd.read_csv(lexicon, sep='\t')
    # Keep only what precedes the ' #' marker.
    base_form = table['lemma'].str.split(' #').str[0]
    table['keyword_no_cat'] = base_form.str.strip().str.replace(' ', '_')
    return table
# Load the lexicon from ``lexicon.csv`` (path relative to the working directory).
lexicon = read_lexicon(lexicon="lexicon.csv")
# Post-process the translation output.
def process_output_trad(pred):
    """Split the decoded prediction into its whitespace-separated tokens.

    Args:
        pred: Decoded model output.

    Returns:
        List of tokens; empty when ``pred`` is empty or only whitespace.
    """
    tokens = pred.split()
    return tokens
def get_id_picto_from_predicted_lemma(df_lexicon, lemma):
    """Look up the pictogram id of a predicted lemma in the lexicon.

    A single trailing ``!`` is removed from ``lemma`` before the lookup.

    Args:
        df_lexicon: DataFrame with ``keyword_no_cat`` and ``id_picto`` columns.
        lemma: Predicted lemma.

    Returns:
        ``(id_picto, lemma)`` using the first matching row, or
        ``(0, lemma)`` when no row matches. ``lemma`` is the cleaned form.
    """
    lemma = lemma[:-1] if lemma.endswith("!") else lemma
    is_match = df_lexicon['keyword_no_cat'] == lemma
    matching_ids = df_lexicon.loc[is_match, 'id_picto'].tolist()
    if not matching_ids:
        return (0, lemma)
    return (matching_ids[0], lemma)
# Build the HTML content used to display the pictograms.
def generate_html(ids):
    """Return a standalone HTML page with one captioned pictogram per lemma.

    Args:
        ids: Iterable of ``(picto_id, lemma)`` pairs. Pairs whose
            ``picto_id`` equals 0 are skipped.

    Returns:
        The HTML document as a string. Each remaining pair becomes a
        ``<figure>`` holding the ARASAAC image and the lemma as caption.
        The lemma is HTML-escaped so that characters such as ``<``, ``&``
        or ``"`` cannot break the markup.
    """
    # Imported locally on purpose: the script binds a module-level variable
    # named ``html``, which would shadow a top-level ``import html``.
    from html import escape

    stylesheet = '''
    figure {
        display: inline-block;
        text-align: center;
        font-family: Arial, sans-serif;
        margin: 0;
    }
    figcaption {
        color: black;
        background-color: white;
        border-radius: 5px;
    }
    img {
        background-color: white;
        margin: 0;
        padding: 0;
        border-radius: 6px;
    }
    '''
    # Collect fragments and join once instead of repeated string concatenation.
    fragments = ['<html><head><style>', stylesheet, '</style></head><body>']
    for picto_id, lemma in ids:
        if picto_id == 0:  # ignore invalid IDs
            continue
        img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
        safe_lemma = escape(str(lemma), quote=True)
        fragments.append(f'''
        <figure>
            <img src="{img_url}" alt="{safe_lemma}" width="200" height="200"/>
            <figcaption>{safe_lemma}</figcaption>
        </figure>
        ''')
    fragments.append('</body></html>')
    return ''.join(fragments)
def generate_pdf(ids):
    """Write the pictograms and their lemmas to ``pictograms.pdf``.

    Args:
        ids: Iterable of ``(picto_id, lemma)`` pairs. Pairs whose
            ``picto_id`` equals 0 are skipped.

    Returns:
        The path of the written PDF file (``"pictograms.pdf"``).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    # The caption font never changes, so select it once before the loop.
    pdf.set_font("Arial", size=12)
    for picto_id, lemma in ids:
        if picto_id == 0:  # ignore invalid IDs
            continue
        img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
        pdf.image(img_url, x=None, y=None, w=50, h=50)
        pdf.ln(55)
        # Built-in fonts such as Arial only encode Latin-1. Replace any other
        # character (e.g. "œ") so the PDF is still produced instead of
        # failing with an encoding error.
        caption = lemma.encode("latin-1", errors="replace").decode("latin-1")
        pdf.cell(200, 10, txt=caption, ln=True, align='C')
    # NOTE(review): the output name is fixed, so concurrent sessions write
    # to the same file — confirm whether that matters for this deployment.
    pdf_path = "pictograms.pdf"
    pdf.output(pdf_path)
    return pdf_path
st.title("Traduction d'une phrase en pictogrammes ARASAAC")
sentence = st.text_input("Entrez une phrase en français:")

if sentence:
    # Encode the sentence and sample a translation from the model.
    inputs = tokenizer(sentence, return_tensors="pt").input_ids
    sampling_options = dict(max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs = model.generate(inputs, **sampling_options)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Map every predicted lemma to a (pictogram id, lemma) pair.
    sentence_to_map = process_output_trad(pred)
    pictogram_ids = [
        get_id_picto_from_predicted_lemma(lexicon, lemma)
        for lemma in sentence_to_map
    ]

    # Render the pictograms inside an embedded HTML component.
    html = generate_html(pictogram_ids)
    st.components.v1.html(html, height=800, scrolling=True)

    # Container holding the download button.
    download_container = st.container()
    with download_container:
        pdf_path = generate_pdf(pictogram_ids)
        with open(pdf_path, "rb") as pdf_file:
            st.download_button(
                label="Télécharger la traduction en PDF",
                data=pdf_file,
                file_name="pictograms.pdf",
                mime="application/pdf",
            )