import pandas as pd #Data Cleansing etc
import stweet as st #Tweets Scrapping tool
from wordcloud import WordCloud 
import matplotlib.pyplot as plt #Visualisation
from sklearn.feature_extraction.text import TfidfVectorizer #Vectorize text
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split #ML Data Split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score #Model Evaluation


# search_tweets_task = st.SearchTweetsTask(all_words='sepakbola until:2023-03-30 since:2023-03-15')

# output_jl_tweets = st.JsonLineFileRawOutput('output_raw_search_tweets_sepakbola.jl')
# output_jl_users = st.JsonLineFileRawOutput('output_raw_search_users_sepakbola.jl')

# output_print = st.PrintRawOutput()

# st.TweetSearchRunner(search_tweets_task=search_tweets_task,tweet_raw_data_outputs=[output_print, output_jl_tweets], user_raw_data_outputs=[output_print, output_jl_users]).run()


# tweet_new = pd.read_json('output_raw_search_tweets_sepakbola.jl', lines=True)

# tweet_clean = pd.json_normalize(tweet_new.raw_value)
# tweet_clean['created_at'] = pd.to_datetime(tweet_clean['created_at']) 

# # tweet_clean = tweet_clean[['full_text', 'created_at']]

# tweet_clean.describe()


#Jalankan perintah di bawah ini untuk mengetahui kolom data mana yang dibutuhkan (Semisal: Jumlah like, jumlah retweet dsb)

# tweet_new['raw_value'][1]


# tweet_clean.to_csv('clean_tweet_sepakbola.csv')

#kemudian, diread lagi
tweet_clean = pd.read_csv('clean_tweet_sepakbola.csv', index_col = 0, parse_dates = ['created_at'])


#Membuat kolom baru untuk menampilkan hari
tweet_clean['day'] = tweet_clean['created_at'].dt.day_name()

tweet_clean.head()


tweet_clean['full_text']

0      Instagram Story dari dua calon pemain Naturali...
1      🚨🚨🚨\n\n𝐁𝐑𝐄𝐀𝐊𝐈𝐍𝐆: Piala Dunia U-20 2023 telah r...
2      Kalian koar-koar tentang kemanusiaan di Negara...
3      Lini Masa Piala Dunia U-20 2023!🇮🇩🥹, Written b...
4      Selamat hari Kamis semua, selamat beraktivitas...
                             ...                        
435    Erick Thohir berkomitmen untuk memberikan duku...
436    Dengan pengalamannya yang luas di dunia olahra...
437    Salah satu misi besar Erick Thohir adalah memp...
438    Dengan kehadiran pak Erick Thohir dan janji un...
439    Yakin banget kalo pak ET bisa mengubah wajah s...
Name: full_text, Length: 440, dtype: object


#Visualisasi Hari Tweet
pd.crosstab(index = tweet_clean['day'] , columns = 'count').sort_values(by = 'count', ascending =False).plot(kind = 'bar')

<AxesSubplot:xlabel='day'>


#Visualisasi Engagement
#Dengan rumus Engagement = reply count + like + retweet +  quote tweet

#Subset hanya column yang dibutuhkan
tweet_baru = tweet_clean[['retweet_count', 'reply_count', 'favorite_count', 'quote_count', 'created_at', 'full_text']]

#Pilih column yang ingin dijumlahkan
column_names = ['retweet_count', 'quote_count', 'favorite_count', 'reply_count']
tweet_baru['engagement']= tweet_baru[column_names].sum(axis=1)
# tweet_baru['engagement'] = tweet_baru['retweet_count'] +  tweet_baru['quote_count'] +  tweet_baru['favorite_count'] + tweet_baru['reply_count']

pd.crosstab(index = tweet_baru['created_at'], columns= 'rate', values=tweet_baru['engagement'] , aggfunc='sum').plot()

C:\Users\USER\AppData\Local\Temp/ipykernel_14056/2533363241.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweet_baru['engagement']= tweet_baru[column_names].sum(axis=1)

<AxesSubplot:xlabel='created_at'>


#WordCloud
#Melihat kata yang sering muncul
text = tweet_clean['full_text'].tolist() 

# join the list and lowercase all the words
text = ' '.join(text).lower()

#create the wordcloud object
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)

#plot the wordcloud object
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()


tweet_sepakbola = tweet_clean.copy()
tweet_sepakbola = tweet_sepakbola[['full_text', 'created_at']]
tweet_sepakbola


tweet = pd.read_csv("data_input/tweet_indonesian.csv", index_col = 0)
tweet = tweet.dropna()
tweet.head()


#Periksa proporsional kelas positif dan negatif 
tweet['sentimen'].value_counts()

1.0    4188
0.0    2792
Name: sentimen, dtype: int64


tweet_clean = tweet['Tweet'].replace('\s+', ' ', regex=True)  # replace white space
tweet_clean=tweet_clean.replace('\.+', '', regex=True) #replace titik

special_char_list = [':', ';', '?', '}', ')', '{', '(']

for special_char in special_char_list:
    tweet_clean=tweet_clean.replace(special_char, '')
    
tweet_clean = tweet_clean.replace('((www\.[\s]+)|(https?://[^\s]+))','URL',regex=True) #replace url
tweet_clean =tweet_clean.replace(r'#([^\s]+)', r'\1', regex=True) # replace hashtag
tweet_clean =tweet_clean.replace('\s+', ' ', regex=True) 

vectorizer = TfidfVectorizer(max_features=1000)

v_data = vectorizer.fit_transform(tweet_clean.values.astype('U')).toarray()

print (v_data)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


X_train, X_test, y_train, y_test = train_test_split(v_data, tweet['sentimen'], test_size=0.2, random_state=0)

model = MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB()


y_preds = model.predict(X_test)

print(confusion_matrix(y_test,y_preds))
print(classification_report(y_test,y_preds))

[[317 241]
 [ 98 740]]
              precision    recall  f1-score   support

         0.0       0.76      0.57      0.65       558
         1.0       0.75      0.88      0.81       838

    accuracy                           0.76      1396
   macro avg       0.76      0.73      0.73      1396
weighted avg       0.76      0.76      0.75      1396


# lakukan preprocesing yang sama saat test model tweet Sepakbola

tweet_more_clean = tweet_sepakbola['full_text'].replace('\s+', ' ', regex=True)  # replace white space
tweet_more_clean_2=tweet_more_clean.replace('\.+', '', regex=True) #replace titik

special_char_list = [':', ';', '?', '}', ')', '{', '(']

for special_char in special_char_list:
    tweet_more_clean2=tweet_more_clean_2.replace(special_char, '')
    
tweet_more_clean2 = tweet_more_clean2.replace('((www\.[\s]+)|(https?://[^\s]+))','URL',regex=True) #replace url
tweet_more_clean2 =tweet_more_clean2.replace(r'#([^\s]+)', r'\1', regex=True) # replace hashtag
tweet_more_clean2 =tweet_more_clean2.replace('\s+', ' ', regex=True) 

vectorizer2 = TfidfVectorizer(max_features=1000)

v_data2 = vectorizer2.fit_transform(tweet_more_clean2.values.astype('U')).toarray()

#predick untuk mendapatkan sentimen dari hasil scrap

model.predict(v_data2)

array([1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1.,
       1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0.,
       0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.])


X_train, X_test, y_train, y_test = train_test_split(v_data, tweet['sentimen'], test_size=0.2, random_state=0)

model = MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB()


#untuk menyesuaikan panjang data test(tweet sepakbola) dengan y_test dari data tweet, maka perlu disesuaikan jumlah rownya
y_test_custom = y_test.head(n=440)


y_preds = model.predict(v_data2)
y_preds
print(confusion_matrix(y_test_custom,y_preds))
print(classification_report(y_test_custom,y_preds))

[[ 46 128]
 [ 68 198]]
              precision    recall  f1-score   support

         0.0       0.40      0.26      0.32       174
         1.0       0.61      0.74      0.67       266

    accuracy                           0.55       440
   macro avg       0.51      0.50      0.49       440
weighted avg       0.53      0.55      0.53       440

	created_at	id	id_str	full_text	truncated	display_text_range	source	in_reply_to_status_id	in_reply_to_status_id_str	in_reply_to_user_id	...	card.users.944180887171743745.advertiser_account_type	card.users.944180887171743745.advertiser_account_service_levels	card.users.944180887171743745.profile_interstitial_type	card.users.944180887171743745.business_profile_state	card.users.944180887171743745.translator_type	card.users.944180887171743745.withheld_in_countries	card.users.944180887171743745.followed_by	card.users.944180887171743745.ext.highlightedLabel.ttl	card.users.944180887171743745.require_some_consent	day
0	2023-03-29 16:26:14+00:00	1641114553542123521	1641114553542123521	Instagram Story dari dua calon pemain Naturali...	False	[0, 215]	<a href="http://twitter.com/download/iphone" r...	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Wednesday
1	2023-03-29 15:07:18+00:00	1641094689431633922	1641094689431633922	🚨🚨🚨\n\n𝐁𝐑𝐄𝐀𝐊𝐈𝐍𝐆: Piala Dunia U-20 2023 telah r...	False	[0, 195]	<a href="http://twitter.com/download/android" ...	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Wednesday
2	2023-03-28 09:55:25+00:00	1640653811713970176	1640653811713970176	Kalian koar-koar tentang kemanusiaan di Negara...	False	[0, 229]	<a href="http://twitter.com/download/iphone" r...	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Tuesday
3	2023-03-29 15:58:32+00:00	1641107581203787777	1641107581203787777	Lini Masa Piala Dunia U-20 2023!🇮🇩🥹, Written b...	False	[0, 273]	<a href="http://twitter.com/download/iphone" r...	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Wednesday
4	2023-03-29 21:32:04+00:00	1641191518819213312	1641191518819213312	Selamat hari Kamis semua, selamat beraktivitas...	False	[0, 74]	<a href="http://twitter.com/download/iphone" r...	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Wednesday

	sentimen	Tweet
5328.0	0.0	doa rezeki tak putus inna haa zaa larizquna ma...
5329.0	0.0	makasih loh ntar kita bagi hasil aku 99 9 sisa...
5330.0	0.0	ya aku akan menjadi satu satunya bukan nomor s...
5331.0	0.0	i dont know why but these zikir sangat membant...
5332.0	0.0	aah kamu aja mas aku lebih suka diayomi

Persiapan¶

Scrap Tweets¶

CSV Save¶

Exploratory Data Analysis¶

Prediksi Sentimen pada Tweet¶

Evaluasi Model¶