File size: 3,159 Bytes
8546e4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import streamlit as st
import tensorflow as tf
import numpy as np
import transformers
from transformers import AutoTokenizer,TFBertForSequenceClassification
import re
import string
import preprocessor as p
from tensorflow import keras

# Every encoded sentence is padded or truncated to this many tokens.
max_seq = 110

# Tokenizer belonging to the "indolem/indobert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")

# Function to preprocess the data
def preprocess_data(data):
    """Clean and tokenize every sentence contained in ``data``.

    Args:
        data: Object exposing ``tolist()`` (the caller in this file passes a
            NumPy array of strings); each element is one raw sentence.

    Returns:
        A list with one ``(input_ids, attention_mask)`` tuple per sentence,
        in input order, taken from the tokenizer output.
    """
    def _encode(raw_sentence):
        # Normalise the text first, then pad/truncate to ``max_seq`` tokens.
        encoding = tokenizer.encode_plus(
            text_preprocess(raw_sentence),
            add_special_tokens=True,
            max_length=max_seq,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )
        return encoding['input_ids'], encoding['attention_mask']

    return [_encode(raw_sentence) for raw_sentence in data.tolist()]

# Function to preprocess the sentence
def text_preprocess(sentence):
    """Normalise one raw sentence before it is tokenized.

    The cleaning pipeline runs exactly once: library cleaner, URL removal,
    stripping of everything except ASCII letters and whitespace,
    lower-casing and whitespace collapsing.

    Args:
        sentence: Raw input text.

    Returns:
        The cleaned text: lower-case ``a-z`` words separated by single
        spaces (possibly an empty string).
    """
    # Library-level cleaning comes first (``preprocessor.clean``). Presumably
    # this drops tweet artefacts such as URLs, mentions, hashtags and emojis
    # -- confirm against the installed package's default options.
    sentence = p.clean(sentence)
    # URLs have to be removed *before* punctuation is stripped: once "://"
    # is gone this pattern can no longer match and the URL's letters would
    # leak into the text.
    sentence = re.sub(r'http[s]?://\S+', '', sentence)
    # Keep ASCII letters and whitespace only. One pass covers punctuation,
    # digits, emojis and every other non-ASCII symbol.
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    # Deleting characters can leave runs of blanks or stray line breaks;
    # split/join collapses them into single spaces and trims both ends.
    return ' '.join(sentence.lower().split())

# Function to perform sentiment prediction
def predict_sentiment(sentence):
    """Classify one sentence as negative, neutral or positive.

    Args:
        sentence: Raw text to classify.

    Returns:
        One of ``"negative"``, ``"neutral"`` or ``"positive"``.
    """
    # A one-element batch goes in, so exactly one encoded pair comes back.
    input_ids, attention_mask = preprocess_data(np.array([sentence]))[0]
    prediction = model.predict([input_ids, attention_mask])

    # NOTE(review): ``model.predict`` on a Hugging Face TF model may return an
    # output wrapper (attribute ``logits``), a dict or a tuple rather than a
    # bare array -- confirm for the installed transformers version. argmax on
    # such a wrapper would not look at the class scores, so unwrap first.
    if hasattr(prediction, "logits"):
        logits = prediction.logits
    elif isinstance(prediction, dict):
        logits = prediction["logits"]
    elif isinstance(prediction, (tuple, list)):
        logits = prediction[0]
    else:
        logits = prediction

    # Single-sentence batch: flatten to that sentence's class scores.
    scores = np.asarray(logits).reshape(-1)
    label_mapping = {0: "negative", 1: "neutral", 2: "positive"}
    # Plain int key instead of a NumPy integer scalar.
    return label_mapping[int(np.argmax(scores))]

# Streamlit app
def main():
    """Render the page: title, text box, button and a coloured result banner."""
    st.title("Analisis Sentimen Berbahasa Indonesia")
    sentence = st.text_input("Masukkan teks disini:")

    # Nothing to classify until the button has been pressed.
    if not st.button("Cek Kalimat"):
        return

    st.write("Hasil Klasifikasi:")
    # One pre-rendered HTML banner per label returned by predict_sentiment().
    banners = {
        "positive": '<div style="background-color: green; padding: 10px; color:white;">Sentiment: positive</div>',
        "negative": '<div style="background-color: #FE4365; padding: 10px; color:white;">Sentiment: negative</div>',
        "neutral": '<div style="background-color: #FDFD96; padding: 10px; color: black;">Sentiment: neutral</div>',
    }
    banner = banners.get(predict_sentiment(sentence))
    # An unrecognised label renders no banner at all.
    if banner is not None:
        st.markdown(banner, unsafe_allow_html=True)


if __name__ == '__main__':
    # Register the BERT classifier class under its own name for the duration
    # of the block; both the model load and the app run inside this scope.
    custom_objects = {'TFBertForSequenceClassification': TFBertForSequenceClassification}
    with keras.utils.custom_object_scope(custom_objects):
        # Module-level global: predict_sentiment() reads ``model`` directly.
        model = TFBertForSequenceClassification.from_pretrained(
            'muhfrrazi/IndoBERT-Sentiment-Analysist_Dataset-Indonesia'
        )

        main()