Twitch Chat Classifier¶
Project to examine the potential of Machine Learning to classify twitch chat messages as "belonging" to a certain broadcaster.
Charlie Schatmeyer, 2025
Background¶
Twitch.tv is a livestreaming website, similar to services on YouTube, Instagram, and TikTok. What may separate Twitch from all other livestreaming platforms is Twitch Chat, where anonymized usernames, 3rd party plugins (such as BTTV), and a long organic culture of emotes and mannerisms make the chatting experience more unique than any other. To that end, most streamers that attempt to cultivate an active chat create a unique community, often so unique that the streamer can be identified by the chat alone. At least, this project aims to quantify that.
Through this project, I aim to explore the potential of predicting, from a single chat message of no more than 255 characters, the streamer whose chat the message comes from. I'll look into binary classification, larger-scale classification, and as a bonus, graphing the similarities between Twitch chats in order to get a sense for the different communities and ways of chatting on Twitch.
This project aims to be an exploration of possibilities, without the expenses or investments into a full application. With that said, there is strong potential to use the final classifier as a structural element in a game (As an example, give the player a streamer, and they must act like that chat, or get the classifier to predict them as that chat, within 10 messages). The network graph also shows potential, where an expanded graph showing more than 50 streamers (and the bias of a single vod) could be a popular visualization.
As far as my research has shown me, no prior work has been done on the classification of twitch chat messages. Work has been done w.r.t. tweet classification, which will serve as a backbone for this work.
Import¶
import subprocess
import requests
import json
import os
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPClassifier
import networkx as nx
import time
from joblib import dump, load
from sklearn.svm import SVC
from adjustText import adjust_text
import community
from xgboost import XGBClassifier
from sklearn.decomposition import TruncatedSVD
Functions¶
Download Chat¶
# Download the chat of video_id v to output file o
# Saves in format of [username: message]
def download_chat(v, o, collision="Exit"):
    """Run TwitchDownloaderCLI's ``chatdownload`` command for one video.

    Args:
        v: Video id, passed to the CLI as ``--id``.
        o: Output path, passed to the CLI as ``-o``.
        collision: Value passed to the CLI as ``--collision``.

    A non-zero exit status of the CLI is reported with ``print`` and is not
    re-raised, so callers cannot tell from the return value whether the
    download succeeded.
    """
    options = [
        ("--id", v),
        ("-o", o),
        ("--timestamp-format", "None"),
        ("--threads", "32"),
        ("--collision", collision),
    ]
    command = ["./TwitchDownloaderCLI", "chatdownload"]
    for flag, value in options:
        command.extend((flag, value))
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print("Error downloading chat:", e)
# Download chats from list of video_ids, appends txt files together
def download_chat_list(ids, output):
    """Download several chats and append them all to ``output``.

    Each id is downloaded with ``download_chat`` into a temporary file
    ``v1.txt``, ``v2.txt``, ... in the current working directory; the
    temporary files are then appended to ``output`` in order and removed.

    Args:
        ids: Sequence of video ids (must support ``len``).
        output: Path of the combined file, opened in append mode.

    ``download_chat`` only prints on failure, so a temporary file may be
    missing; such files are skipped with a message instead of aborting the
    whole merge. Files are read and written as UTF-8 to match
    ``load_data_from_directory``, which reads the chats as UTF-8.
    """
    temp_files = [f"v{i}.txt" for i in range(1, len(ids) + 1)]
    # download individual chats
    for v, temp_file in zip(ids, temp_files):
        download_chat(v, temp_file)
    try:
        # append chats together
        with open(output, "a", encoding="utf-8") as outfile:
            for temp_file in temp_files:
                try:
                    with open(temp_file, "r", encoding="utf-8") as infile:
                        outfile.write(infile.read())
                except FileNotFoundError:
                    print("Skipping missing chat file:", temp_file)
    finally:
        # remove individual chat files, even if merging failed part-way
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)
Get Twitch OAuth¶
client_id = "removed"
client_secret = "removed"


def get_oauth_token():
    """Request an app access token from Twitch and store it globally.

    Uses the client-credentials grant with the module-level ``client_id``
    and ``client_secret``. The token is stored in the global ``oauth_token``
    (read by ``get_user_id`` and ``user_vod_ids``) and also returned.

    Returns:
        The ``access_token`` string from the response.

    Raises:
        requests.HTTPError: If Twitch answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    global oauth_token
    url = "https://id.twitch.tv/oauth2/token"
    payload = {
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials"
    }
    # Send the credentials in the form-encoded body rather than the query
    # string so the client secret does not end up in logged URLs.
    response = requests.post(url, data=payload, timeout=30)
    # Fail with the HTTP error rather than a KeyError on 'access_token'.
    response.raise_for_status()
    oauth_token = response.json()["access_token"]
    return oauth_token
Get User ID¶
def get_user_id(username):
    """Look up the Twitch user id for a login name via the Helix API.

    Requires the global ``oauth_token`` to be set (see ``get_oauth_token``).

    Args:
        username: Twitch login name.

    Returns:
        The ``id`` field of the first user in the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        IndexError: If the response contains no user for ``username``.
    """
    response = requests.get(
        "https://api.twitch.tv/helix/users",
        # Let requests build and escape the query string.
        params={"login": username},
        headers={"Client-ID": client_id, "Authorization": f"Bearer {oauth_token}"},
        timeout=30,
    )
    response.raise_for_status()
    users = response.json()['data']
    if not users:
        # Same exception type the bare [0] lookup produced, with a usable message.
        raise IndexError(f"no Twitch user found for login {username!r}")
    return users[0]['id']
Get User VODs¶
# returns last `count` (default 5) vod ids of user_id
def user_vod_ids(user_id, count=5):
    """Return the ids of a user's most recent archived broadcasts.

    Requires the global ``oauth_token`` to be set (see ``get_oauth_token``).

    Args:
        user_id: Twitch user id, as returned by ``get_user_id``.
        count: Number of videos to request (sent as the ``first`` query
            parameter). NOTE(review): the API caps this value; confirm the
            limit in the Helix "Get Videos" reference before raising it.

    Returns:
        List of video id strings, in the order the API returns them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.twitch.tv/helix/videos",
        params={"user_id": user_id, "type": "archive", "first": count},
        headers={"Client-ID": client_id, "Authorization": f"Bearer {oauth_token}"},
        timeout=30,
    )
    response.raise_for_status()
    return [video['id'] for video in response.json()['data']]
Load txts to Dataframe¶
# gets txt files from directory, loads lines into df labelled by file name
def load_data_from_directory(directory):
    """Load every ``.txt`` file in ``directory`` into one DataFrame.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        DataFrame with columns ``text`` (one raw line per row, trailing
        newline included) and ``label`` (the file name without extension).

    Files are visited in sorted name order so the row order is the same on
    every machine; ``os.listdir`` alone returns entries in arbitrary order.
    """
    data = []
    labels = []
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.txt'):
            continue
        label = os.path.splitext(file)[0]  # Use file name as label
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
        data.extend(lines)
        labels.extend([label] * len(lines))
    return pd.DataFrame({'text': data, 'label': labels})
Downsize¶
# Downsize dataframe to max_rows per label
def downsize_dataframe(df, label_column, max_rows=10000, random_state=None):
    """Randomly keep at most ``max_rows`` rows for each label.

    Args:
        df: Input DataFrame.
        label_column: Name of the column to group by.
        max_rows: Maximum number of rows kept per distinct label.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            down-sampling can be reproduced. ``None`` (default) keeps the
            previous unseeded behaviour.

    Returns:
        A new DataFrame with all original columns (including
        ``label_column``), groups in sorted label order, and a fresh
        0..n-1 index.

    The groups are sampled explicitly instead of through ``groupby.apply``:
    applying a function that relies on the grouping column is deprecated in
    pandas and newer versions exclude that column, which would silently drop
    the label from the result.
    """
    sampled_groups = [
        group.sample(n=min(len(group), max_rows), random_state=random_state)
        for _, group in df.groupby(label_column)
    ]
    if not sampled_groups:
        # Nothing to sample (empty input): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)
Clean Chat Messages¶
# Preprocess text data
def preprocess_text(text):
    """Strip the username and normalise chat lines.

    Args:
        text: pandas Series of raw lines in ``username: message`` form.

    Returns:
        Series of messages, lower-cased, with every character that is
        neither a word character nor whitespace removed, and surrounding
        whitespace stripped. Lines without any ``:`` become NaN.
    """
    # Split only on the FIRST colon: the message itself may contain colons
    # (links, emoticons, "re: ..."), which an unlimited split would cut off.
    text = text.str.split(":", n=1).str[1]  # Remove usernames
    text = text.str.lower()  # Lowercase
    text = text.str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
    text = text.str.strip()  # Remove leading/trailing whitespace
    return text
# Weaker preprocessing (only remove usernames, whitespace, no removal of punctuation or case)
def weak_preprocess_text(text):
    """Strip the username from chat lines, keeping case and punctuation.

    Args:
        text: pandas Series of raw lines in ``username: message`` form.

    Returns:
        Series of messages with surrounding whitespace stripped. Lines
        without any ``:`` become NaN.
    """
    # Split only on the FIRST colon: the message itself may contain colons
    # (links, emoticons, "re: ..."), which an unlimited split would cut off.
    text = text.str.split(":", n=1).str[1]  # Remove usernames
    text = text.str.strip()  # Remove leading/trailing whitespace
    return text
Cluster Visualization¶
# t-SNE Visualization
def visualize_with_tsne(features, labels):
    """Project ``features`` to 2-D with t-SNE and show a labelled scatter plot.

    Args:
        features: Feature matrix accepted by ``TSNE.fit_transform``.
        labels: One label per row of ``features``; used for point colour.
    """
    embedding = TSNE(n_components=2, random_state=42).fit_transform(features)
    points = pd.DataFrame({
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'label': labels,
    })
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=points, x='x', y='y', hue='label', alpha=0.7)
    plt.title('t-SNE Visualization')
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    plt.show()
# PCA Visualization
def visualize_with_pca(features, labels):
    """Project ``features`` to 2-D with PCA and show a labelled scatter plot.

    Args:
        features: Feature matrix accepted by ``PCA.fit_transform``.
        labels: One label per row of ``features``; used for point colour.
    """
    components = PCA(n_components=2, random_state=42).fit_transform(features)
    points = pd.DataFrame({
        'x': components[:, 0],
        'y': components[:, 1],
        'label': labels,
    })
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=points, x='x', y='y', hue='label', alpha=0.7)
    plt.title('PCA Visualization')
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    plt.show()
Document Similarities¶
# Jaccard Similarity
# Takes two lists of words, returns float
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of the distinct items in two lists.

    Args:
        list1: First iterable of hashable items (e.g. words).
        list2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in [0, 1]. When both inputs are
        empty the ratio is undefined; 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    set1, set2 = set(list1), set(list2)
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return len(set1 & set2) / union_size
# Cosine Similarity
# Takes two lists of words, returns float
def cosine_sim(list1, list2):
    """Return the cosine similarity of two word lists under TF-IDF.

    Each list is joined into one space-separated document, both documents
    are vectorized together with a fresh ``TfidfVectorizer``, and the cosine
    similarity of the two resulting rows is returned.

    NOTE(review): the vectorizer runs with its defaults; per the
    scikit-learn docs those lowercase the text and ignore punctuation and
    single-character tokens. Confirm that is intended when comparing
    weakly preprocessed chats.
    """
    # TF-IDF works on raw text, so turn each word list back into a string
    documents = [" ".join(words) for words in (list1, list2)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    similarity = cosine_similarity(tfidf[0], tfidf[1])
    return similarity[0][0]
Predict Word¶
# Predict Word
# Takes a word, label mapping, model, and vectorizer, returns a prediction
def predict_word(word, model, vect, label_mapping):
    """Return the label name the model predicts for a single text.

    The model's predicted class is used as a position into the keys of
    ``label_mapping``, so the mapping's insertion order must match the
    integer encoding the model was trained on.
    """
    vectorized = vect.transform([word])
    class_index = model.predict(vectorized)[0]
    label_names = list(label_mapping.keys())
    return label_names[class_index]
# Predict Word with Confidence
# Takes a word, label mapping, model, and vectorizer, returns prediction and confidence
def predict_word_confidence(word, model, vect, label_mapping):
    """Return the predicted label name and the model's top probability.

    Args:
        word: Text to classify.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        vect: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded class; its key order is
            used to turn the predicted class back into a name.

    Returns:
        Tuple ``(prediction, confidence)`` where ``confidence`` is the
        largest value returned by ``predict_proba`` for this text.
    """
    # Vectorize once and reuse the result for both model calls.
    vectorized = vect.transform([word])
    prediction = list(label_mapping.keys())[model.predict(vectorized)[0]]
    confidence = model.predict_proba(vectorized).max()
    return prediction, confidence
# Get Confidence for all Labels given Word
# Takes a word, label mapping, model, and vectorizer, returns all predictions and their confidence
def get_confidence_for_all_labels(word, model, vect, label_mapping):
    """Return the model's probability for every label, for a single text.

    Args:
        word: Text to classify.
        model: Fitted classifier exposing ``predict_proba``.
        vect: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded class; the i-th key is
            paired with the i-th probability column.

    Returns:
        Dict mapping each label name to its predicted probability.
    """
    # Run the vectorizer and the model once, not once per label.
    probabilities = model.predict_proba(vect.transform([word]))[0]
    all_predictions = {label: probabilities[i] for i, label in enumerate(label_mapping)}
    return all_predictions
# for each streamer label, select k random messages in that label and get the average confidence of all labels from predicting the k messages,
# return a dataframe with each streamer label, the label's highest average confidence,
# the label with the highest average confidence, and the average confidence of the correct label
# also return the accuracy (% of correct labels)
def simulate_k_predictions_for_each_label(data, k, model, vectorizer, label_mapping):
    """Simulate guessing each streamer from ``k`` of their messages.

    Args:
        data: DataFrame with ``text`` and ``label`` columns.
        k: Number of messages sampled per label (seeded with 42). Each label
            must have at least ``k`` rows, otherwise ``DataFrame.sample``
            raises ValueError.
        model: Fitted classifier exposing ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded class; the i-th key is
            paired with the i-th probability column.

    Returns:
        Tuple ``(results, accuracy)``. ``results`` has one row per label with
        columns ``label``, ``highest_avg_confidence``,
        ``highest_avg_confidence_label`` and ``correct_avg_confidence``.
        ``accuracy`` is the fraction of labels whose highest average
        confidence belongs to the label itself (0.0 for an empty mapping).
    """
    label_names = list(label_mapping.keys())
    results = []
    for label in label_names:
        # get k random messages from the label
        sample = data[data['label'] == label].sample(n=k, random_state=42)
        # Predict the whole sample in one batch instead of one model call
        # per message; rows are messages, columns are labels.
        probabilities = np.asarray(model.predict_proba(vectorizer.transform(sample['text'])))
        # get the average confidence for each label
        column_means = probabilities.sum(axis=0) / k
        avg_confidences = {name: column_means[i] for i, name in enumerate(label_names)}
        # get the label with the highest average confidence
        highest_avg_confidence_label = max(avg_confidences, key=avg_confidences.get)
        results.append({
            'label': label,
            'highest_avg_confidence': avg_confidences[highest_avg_confidence_label],
            'highest_avg_confidence_label': highest_avg_confidence_label,
            'correct_avg_confidence': avg_confidences[label]
        })
    # calculate accuracy
    correct_predictions = sum(1 for r in results if r['label'] == r['highest_avg_confidence_label'])
    accuracy = correct_predictions / len(results) if results else 0.0
    return pd.DataFrame(results), accuracy
Proof of Concept - NL vs xQc¶
Northernlion (NL) and xQc are streamers I hand picked to serve as a proof of concept for chat classification. The streamers, and their chats, are close to polar opposites. NL appeals to an older audience, playing puzzle/strategy/thinking games and overall having a calm vibe. xQc brings a more chaotic environment, reflecting a younger, more hyper-online audience. With that said, they are still similar in lots of ways. Both are among the top 50 English Streamers on the website, amassing 10k and 40k average viewers respectively. Their chats are also both English speaking, and share an amount of Twitch Brainrot, or a propensity to use relevant memes and emotes. This balanced challenge gives an idea of how a full classifier might perform.
Using TwitchDownloaderCLI, each chat is downloaded based on a vod from each streamer hand-selected to be representative (not covering a special event, going through a typical broadcast). Each individual chat is loaded as a labelled row to a dataframe. The text is preprocessed in the standard way (though is tested again without removing special characters or capitalization, as in twitch chat, "?????" can be a common message, and there is a difference between "LuL" and "LUL"). Tfidf is used to vectorize the text (as meaning-based solutions are less useful in this context) with several different parameters. Naive Bayes and Logistic Regression are chosen as models due to their inexpensiveness and overall strong performance. KMeans Clustering is also tested as a failed experiment with low accuracy.
Download Chat¶
# Basic test to download Northernlion's Chat
# The download call is commented out (presumably so re-running the notebook
# does not fetch the chat again); uncomment it to refresh txts/nl.txt.
video_id = "2362321171" # NL 1/24/25
output_file = "txts/nl.txt"
# download_chat(video_id, output_file)
# Basic test to download xQc's Chat
# video_id / output_file are reassigned here, so after this cell they hold
# the xQc values.
video_id = "2362597129" #xQc 1/24/25
output_file = "txts/xqc.txt"
# download_chat(video_id, output_file)
Preprocess¶
# Load data: one row per chat line, labelled by file name (nl, xqc)
data = load_data_from_directory('txts')
# Downsize data to 10,000 chats per streamer
# NOTE: the sample is drawn without a fixed seed, so the selected rows
# differ from run to run.
data = downsize_dataframe(data, 'label', 10000)
# Preprocess text (strip usernames, lowercase, drop punctuation)
data['text'] = preprocess_text(data['text'])
# Encode labels numerically, in order of first appearance in the frame
label_mapping = {label: idx for idx, label in enumerate(data['label'].unique())}
data['label_encoded'] = data['label'].map(label_mapping)
Word Cloud¶
Get a visual idea of the messages sent in each chat
# Generate word cloud for only NL
# All NL messages are joined into one string and fed to WordCloud.
wc = WordCloud().generate(' '.join(data[data['label'] == 'nl']['text']))
# Hide the axes; only the rendered cloud image is of interest.
plt.axes().set_axis_off()
plt.imshow(wc)
<matplotlib.image.AxesImage at 0x145b7bc50>
# Generate word cloud for xQc
# All xQc messages are joined into one string and fed to WordCloud.
wc = WordCloud().generate(' '.join(data[data['label'] == 'xqc']['text']))
# Hide the axes; only the rendered cloud image is of interest.
plt.axes().set_axis_off()
plt.imshow(wc)
<matplotlib.image.AxesImage at 0x1465e0ad0>
Get Overall Chat Similarity¶
# Make array of all individual words
# NOTE(review): split(' ') keeps empty strings wherever two spaces are
# adjacent (or a message is empty), so '' can count as a "word" here.
nl_words = ' '.join(data[data['label'] == 'nl']['text']).split(' ')
xqc_words = ' '.join(data[data['label'] == 'xqc']['text']).split(' ')
# Vocabulary overlap between the two chats
print(f"Jaccard Similarity: {jaccard_similarity(xqc_words, nl_words)}")
Jaccard Similarity: 0.17324976348155155
# TF-IDF cosine similarity between the two chats, each treated as one document
print(f"Cosine Similarity: {cosine_sim(xqc_words, nl_words)}")
Cosine Similarity: 0.33939988175303515
Classification, Basic Tf-idf¶
# Feature extraction using TF-IDF
# NOTE(review): the vectorizer is fitted on the full data set before the
# train/test split below, so vocabulary and IDF weights also see test rows.
vectorizer = TfidfVectorizer(max_features=5000) # Max_Features chosen based on performance/speed tradeoff after testing multiple options
# .toarray() turns the sparse TF-IDF matrix into a dense array
features = vectorizer.fit_transform(data['text']).toarray()
labels = data['label_encoded'].values
# Split data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
# Naive Bayes Classifier
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
# Evaluate Naive Bayes Classifier on the held-out 20%
y_pred = nb_clf.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred, target_names=label_mapping.keys()))
Naive Bayes Accuracy: 0.85325 Naive Bayes Classification Report: precision recall f1-score support nl 0.89 0.81 0.85 2019 xqc 0.82 0.89 0.86 1981 accuracy 0.85 4000 macro avg 0.86 0.85 0.85 4000 weighted avg 0.86 0.85 0.85 4000
# Logistic Regression Classifier
# Trained on the same word-level TF-IDF split as the Naive Bayes model above.
lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
lr_clf.fit(X_train, y_train)
# Evaluate Logistic Regression Classifier
# y_pred is overwritten here with the LR predictions.
y_pred = lr_clf.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred, target_names=label_mapping.keys()))
Logistic Regression Accuracy: 0.849 Logistic Regression Classification Report: precision recall f1-score support nl 0.88 0.81 0.84 2019 xqc 0.82 0.89 0.85 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
# Random Forest Classifier
# Trained on the same word-level TF-IDF split as the models above.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_clf.fit(X_train, y_train)
# Evaluate Random Forest Classifier
# y_pred is overwritten here with the RF predictions.
y_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred, target_names=label_mapping.keys()))
Random Forest Accuracy: 0.84975 Random Forest Classification Report: precision recall f1-score support nl 0.82 0.91 0.86 2019 xqc 0.89 0.79 0.84 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
Classification, n-Gram Tf-idf¶
# Vectorize with word n-grams (1- to 3-word sequences); no analyzer is
# passed here, unlike the character-level cell that sets analyzer='char_wb'
n_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
n_features = n_vectorizer.fit_transform(data['text']).toarray()
n_labels = data['label_encoded'].values
# Split data (same test_size and seed as the basic TF-IDF split)
n_X_train, n_X_test, n_y_train, n_y_test = train_test_split(n_features, n_labels, test_size=0.2, random_state=42)
# Naive Bayes Classifier
n_nb_clf = MultinomialNB()
n_nb_clf.fit(n_X_train, n_y_train)
n_y_pred = n_nb_clf.predict(n_X_test)
n_nb_accuracy = accuracy_score(n_y_test, n_y_pred)
print("Naive Bayes Accuracy:", n_nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(n_y_test, n_y_pred, target_names=label_mapping.keys()))
Naive Bayes Accuracy: 0.85325 Naive Bayes Classification Report: precision recall f1-score support nl 0.89 0.81 0.85 2019 xqc 0.82 0.89 0.86 1981 accuracy 0.85 4000 macro avg 0.86 0.85 0.85 4000 weighted avg 0.86 0.85 0.85 4000
# Logistic Regression Classifier
# Trained on the word n-gram TF-IDF split; n_y_pred is overwritten with
# the LR predictions.
n_lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
n_lr_clf.fit(n_X_train, n_y_train)
n_y_pred = n_lr_clf.predict(n_X_test)
n_lr_accuracy = accuracy_score(n_y_test, n_y_pred)
print("Logistic Regression Accuracy:", n_lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(n_y_test, n_y_pred, target_names=label_mapping.keys()))
Logistic Regression Accuracy: 0.85075 Logistic Regression Classification Report: precision recall f1-score support nl 0.88 0.81 0.85 2019 xqc 0.82 0.89 0.86 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
Classification, Tf-idf Character-Level¶
# Vectorize with character n-grams (1 to 5 characters, 'char_wb' analyzer)
c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
c_features = c_vectorizer.fit_transform(data['text']).toarray()
c_labels = data['label_encoded'].values
# Split data (same test_size and seed as the other splits)
c_X_train, c_X_test, c_y_train, c_y_test = train_test_split(c_features, c_labels, test_size=0.2, random_state=42)
# Naive Bayes Classifier
c_nb_clf = MultinomialNB()
c_nb_clf.fit(c_X_train, c_y_train)
c_y_pred = c_nb_clf.predict(c_X_test)
c_nb_accuracy = accuracy_score(c_y_test, c_y_pred)
print("Naive Bayes Accuracy:", c_nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(c_y_test, c_y_pred, target_names=label_mapping.keys()))
Naive Bayes Accuracy: 0.84875 Naive Bayes Classification Report: precision recall f1-score support nl 0.85 0.85 0.85 2019 xqc 0.84 0.85 0.85 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
# Logistic Regression Classifier
# Trained on the character n-gram TF-IDF split; c_y_pred is overwritten
# with the LR predictions.
c_lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
c_lr_clf.fit(c_X_train, c_y_train)
c_y_pred = c_lr_clf.predict(c_X_test)
c_lr_accuracy = accuracy_score(c_y_test, c_y_pred)
print("Logistic Regression Accuracy:", c_lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(c_y_test, c_y_pred, target_names=label_mapping.keys()))
Logistic Regression Accuracy: 0.8625 Logistic Regression Classification Report: precision recall f1-score support nl 0.87 0.85 0.86 2019 xqc 0.85 0.87 0.86 1981 accuracy 0.86 4000 macro avg 0.86 0.86 0.86 4000 weighted avg 0.86 0.86 0.86 4000
Less Preprocessing Classification¶
# Rebuild the data set with weak preprocessing (usernames stripped only).
# NOTE: downsize_dataframe samples without a seed, so lp_data holds a
# different random subset of messages than `data`; rows of the two frames
# do not correspond to each other.
lp_data = load_data_from_directory('txts')
lp_data = downsize_dataframe(lp_data, 'label', 10000)
lp_data['text'] = weak_preprocess_text(lp_data['text'])
lp_label_mapping = {label: idx for idx, label in enumerate(lp_data['label'].unique())}
lp_data['label_encoded'] = lp_data['label'].map(lp_label_mapping)
# Wordclouds for LP in 1x2 grid
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.axis('off')
wc = WordCloud().generate(' '.join(lp_data[lp_data['label'] == 'nl']['text']))
plt.imshow(wc)
plt.title('Northernlion')
plt.subplot(1, 2, 2)
plt.axis('off')
wc = WordCloud().generate(' '.join(lp_data[lp_data['label'] == 'xqc']['text']))
plt.imshow(wc)
plt.title('xQc')
plt.tight_layout()
plt.show()
# Get Overall Chat Similarity
# NOTE(review): split(' ') keeps empty strings for adjacent spaces.
lp_nl_words = ' '.join(lp_data[lp_data['label'] == 'nl']['text']).split(' ')
lp_xqc_words = ' '.join(lp_data[lp_data['label'] == 'xqc']['text']).split(' ')
print(f"Less Preprocessing Jaccard Similarity, NL vs xQc: {jaccard_similarity(lp_xqc_words, lp_nl_words)}")
print(f"Less Preprocessing Cosine Similarity, NL vs xQc: {cosine_sim(lp_xqc_words, lp_nl_words)}")
Less Preprocessing Jaccard Similarity, NL vs xQc: 0.13593539703903096 Less Preprocessing Cosine Similarity, NL vs xQc: 0.35153146875446284
# Similarity Between Less Preprocessed vs Normal
# Compares the same streamer's word list under the two preprocessing
# schemes. NOTE: the two word lists come from independently drawn samples
# (see downsize_dataframe), so differences reflect sampling as well as
# preprocessing.
print(f"Jaccard Similarity, Less Preprocessed vs Normal, NL: {jaccard_similarity(lp_nl_words, nl_words)}")
print(f"Cosine Similarity, Less Preprocessed vs Normal, NL: {cosine_sim(lp_nl_words, nl_words)}")
print(f"Jaccard Similarity, Less Preprocessed vs Normal, xQc: {jaccard_similarity(lp_xqc_words, xqc_words)}")
print(f"Cosine Similarity, Less Preprocessed vs Normal, xQc: {cosine_sim(lp_xqc_words, xqc_words)}")
Jaccard Similarity, Less Preprocessed vs Normal, NL: 0.24566871390594922 Cosine Similarity, Less Preprocessed vs Normal, NL: 0.9861841344801694 Jaccard Similarity, Less Preprocessed vs Normal, xQc: 0.18620356970574048 Cosine Similarity, Less Preprocessed vs Normal, xQc: 0.9787615415988046
# Bar Graph of Similarities, Normal vs Weak Preprocessing
# Each list holds [normal preprocessing, weak preprocessing] for NL vs xQc.
jaccard_similarities = [jaccard_similarity(xqc_words, nl_words), jaccard_similarity(lp_xqc_words, lp_nl_words)]
cosine_similarities = [cosine_sim(xqc_words, nl_words), cosine_sim(lp_xqc_words, lp_nl_words)]
bar_width = 0.4
index = np.arange(2)
# Two bars per group: Jaccard at the group position, cosine shifted right
rects1 = plt.bar(index, jaccard_similarities, bar_width, color='b', label='Jaccard')
rects2 = plt.bar(index + bar_width, cosine_similarities, bar_width, color='r', label='Cosine')
plt.title('Similarity Metrics from Preprocessing Methods')
# Center each tick between the two bars of its group
plt.xticks(index + bar_width / 2, ('Normal', 'Weak (Keeps Punctuation, Caps)'))
plt.legend()
plt.tight_layout()
plt.show()
Basic¶
# Basic TF-IDF on the weakly preprocessed text
lp_vectorizer = TfidfVectorizer(max_features=5000)
lp_features = lp_vectorizer.fit_transform(lp_data['text']).toarray()
lp_labels = lp_data['label_encoded'].values
# 80/20 split with the same seed as the other experiments
lp_X_train, lp_X_test, lp_y_train, lp_y_test = train_test_split(lp_features, lp_labels, test_size=0.2, random_state=42)
# Naive Bayes Classifier
lp_nb_clf = MultinomialNB()
lp_nb_clf.fit(lp_X_train, lp_y_train)
lp_y_pred = lp_nb_clf.predict(lp_X_test)
lp_nb_accuracy = accuracy_score(lp_y_test, lp_y_pred)
print("Naive Bayes Accuracy:", lp_nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(lp_y_test, lp_y_pred, target_names=lp_label_mapping.keys()))
Naive Bayes Accuracy: 0.8355 Naive Bayes Classification Report: precision recall f1-score support nl 0.87 0.79 0.83 2019 xqc 0.81 0.88 0.84 1981 accuracy 0.84 4000 macro avg 0.84 0.84 0.84 4000 weighted avg 0.84 0.84 0.84 4000
# Logistic Regression Classifier
# Trained on the weak-preprocessing word TF-IDF split; lp_y_pred is
# overwritten with the LR predictions.
lp_lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
lp_lr_clf.fit(lp_X_train, lp_y_train)
lp_y_pred = lp_lr_clf.predict(lp_X_test)
lp_lr_accuracy = accuracy_score(lp_y_test, lp_y_pred)
print("Logistic Regression Accuracy:", lp_lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(lp_y_test, lp_y_pred, target_names=lp_label_mapping.keys()))
Logistic Regression Accuracy: 0.83575 Logistic Regression Classification Report: precision recall f1-score support nl 0.87 0.79 0.83 2019 xqc 0.81 0.88 0.84 1981 accuracy 0.84 4000 macro avg 0.84 0.84 0.84 4000 weighted avg 0.84 0.84 0.84 4000
Character n-grams¶
# Character n-grams (1 to 5 characters) on the weakly preprocessed text
lp_c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
lp_c_features = lp_c_vectorizer.fit_transform(lp_data['text']).toarray()
lp_c_labels = lp_data['label_encoded'].values
# 80/20 split with the same seed as the other experiments
lp_c_X_train, lp_c_X_test, lp_c_y_train, lp_c_y_test = train_test_split(lp_c_features, lp_c_labels, test_size=0.2, random_state=42)
# NB
lp_c_nb_clf = MultinomialNB()
lp_c_nb_clf.fit(lp_c_X_train, lp_c_y_train)
lp_c_y_pred = lp_c_nb_clf.predict(lp_c_X_test)
lp_c_nb_accuracy = accuracy_score(lp_c_y_test, lp_c_y_pred)
print("Naive Bayes Accuracy:", lp_c_nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(lp_c_y_test, lp_c_y_pred, target_names=lp_label_mapping.keys()))
Naive Bayes Accuracy: 0.84575 Naive Bayes Classification Report: precision recall f1-score support nl 0.84 0.86 0.85 2019 xqc 0.85 0.84 0.84 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
# LR
# Trained on the weak-preprocessing character n-gram split; lp_c_y_pred is
# overwritten with the LR predictions.
lp_c_lr_clf = LogisticRegression(n_jobs=-1, random_state=42)
lp_c_lr_clf.fit(lp_c_X_train, lp_c_y_train)
lp_c_y_pred = lp_c_lr_clf.predict(lp_c_X_test)
lp_c_lr_accuracy = accuracy_score(lp_c_y_test, lp_c_y_pred)
print("Logistic Regression Accuracy:", lp_c_lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(lp_c_y_test, lp_c_y_pred, target_names=lp_label_mapping.keys()))
Logistic Regression Accuracy: 0.851 Logistic Regression Classification Report: precision recall f1-score support nl 0.86 0.84 0.85 2019 xqc 0.84 0.86 0.85 1981 accuracy 0.85 4000 macro avg 0.85 0.85 0.85 4000 weighted avg 0.85 0.85 0.85 4000
KMeans Clustering¶
# Using lp_c_features (Less Processed, Character N-Grams)
# Unsupervised baseline: cluster into two groups, then name each cluster
# after the true label that occurs most often inside it.
kmeans = KMeans(n_clusters=2, random_state=42).fit(lp_c_features)
# Cluster id of every data point
cluster_assignments = kmeans.labels_
# Majority vote: cluster id -> most frequent true label
cluster_to_label = {}
for cluster in range(2):
    cluster_indices = np.flatnonzero(cluster_assignments == cluster)  # Points in this cluster
    cluster_labels = lp_c_labels[cluster_indices]  # Their true labels
    label_counts = Counter(cluster_labels)
    if not label_counts:
        # Empty cluster: nothing to vote on
        continue
    cluster_to_label[cluster] = label_counts.most_common(1)[0][0]
# Translate cluster ids into predicted labels
km_y_pred = np.array([cluster_to_label[cluster] for cluster in cluster_assignments])
# Score the clustering as if it were a classifier
km_accuracy = accuracy_score(lp_c_labels, km_y_pred)
print("KMeans Accuracy:", km_accuracy)
print("KMeans Classification Report:\n", classification_report(lp_c_labels, km_y_pred, target_names=lp_label_mapping.keys()))
Evaluate Techniques¶
# Bar Graph of Model Accuracies across Methods
# `accuracies` holds every plotted score; it is only used to set the y-limits.
accuracies = [nb_accuracy, lr_accuracy,
n_nb_accuracy, n_lr_accuracy,
c_nb_accuracy, c_lr_accuracy,
lp_nb_accuracy, lp_lr_accuracy,
lp_c_nb_accuracy, lp_c_lr_accuracy]
# One entry per feature-extraction method, in the order of the x tick labels
nb_accuracies = [nb_accuracy, n_nb_accuracy, c_nb_accuracy, lp_nb_accuracy, lp_c_nb_accuracy]
lr_accuracies = [lr_accuracy, n_lr_accuracy, c_lr_accuracy, lp_lr_accuracy, lp_c_lr_accuracy]
bar_width = 0.4
index = np.arange(len(nb_accuracies))
rects1 = plt.bar(index, nb_accuracies, bar_width, color='b', label='Naive Bayes')
rects2 = plt.bar(index + bar_width, lr_accuracies, bar_width, color='r', label='Logistic Regression')
plt.title('Model Accuracies')
plt.xticks(index + bar_width / 2, ['Standard', 'N-Grams', 'Char N-Grams', 'Weak P.P.', 'Weak P.P. c-N-Grams'], fontsize=8)
plt.legend()
# Zoom the y-axis to the observed range so small differences are visible
plt.ylim(min(accuracies) - 0.01, max(accuracies) + 0.01)
plt.show()
Examine Successful, Failed Predictions¶
# Make predictions on full data set using weak preprocessing and character n-grams on naive bayes
# lp_c_features contains both the training and the test rows of the
# earlier split, so this score is not a held-out estimate.
full_labels_pred = lp_c_nb_clf.predict(lp_c_features)
# Evaluate NB on full data set
# For sanity, to check for extreme overfitting
print("Full Data NB Accuracy:", accuracy_score(lp_c_labels, full_labels_pred))
print("Full Data NB Classification Report:\n", classification_report(lp_c_labels, full_labels_pred, target_names=lp_label_mapping.keys()))
Full Data NB Accuracy: 0.87315 Full Data NB Classification Report: precision recall f1-score support nl 0.87 0.88 0.87 10000 xqc 0.88 0.87 0.87 10000 accuracy 0.87 20000 macro avg 0.87 0.87 0.87 20000 weighted avg 0.87 0.87 0.87 20000
# full_labels_pred was computed from lp_c_features, whose rows are the rows
# of lp_data. `data` is a separate, independently sampled frame, so masking
# it with these predictions would pair each prediction with an unrelated
# message. All selections below therefore use lp_data.
# Get all successful predictions
successful_predictions = lp_data[lp_data['label_encoded'] == full_labels_pred]
# Separate successful predictions by streamer
nl_successful_predictions = successful_predictions[successful_predictions['label'] == 'nl']
xqc_successful_predictions = successful_predictions[successful_predictions['label'] == 'xqc']
# Get all failed predictions
failed_predictions = lp_data[lp_data['label_encoded'] != full_labels_pred]
# Separate failed predictions by streamer
xqc_failed_predictions = failed_predictions[failed_predictions['label'] == 'xqc']
nl_failed_predictions = failed_predictions[failed_predictions['label'] == 'nl']
# 2x2 grid of word clouds for successful and failed predictions by streamer
fig, ax = plt.subplots(2, 2, figsize=(14, 7))
ax[0, 0].imshow(WordCloud().generate(' '.join(nl_successful_predictions['text'])))
ax[0, 0].set_title('NL Predicted as NL')
ax[0, 0].axis('off')
ax[0, 1].imshow(WordCloud().generate(' '.join(nl_failed_predictions['text'])))
ax[0, 1].set_title('NL Predicted as xQc')
ax[0, 1].axis('off')
ax[1, 0].imshow(WordCloud().generate(' '.join(xqc_successful_predictions['text'])))
ax[1, 0].set_title('xQc Predicted as xQc')
ax[1, 0].axis('off')
ax[1, 1].imshow(WordCloud().generate(' '.join(xqc_failed_predictions['text'])))
ax[1, 1].set_title('xQc Predicted as NL')
ax[1, 1].axis('off')
plt.tight_layout()
plt.show()
# Get statistics of failed predictions vs full dataset
# Get chat lengths (both taken from lp_data so the two distributions are
# measured on the same text the model was evaluated on)
all_chat_lengths = lp_data['text'].str.len()
failed_chat_lengths = failed_predictions['text'].str.len()
# Box and Whisker Plots of lengths: full range on the left, zoomed on the right
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
ax[0].boxplot([all_chat_lengths, failed_chat_lengths], tick_labels=['All Chats', 'Failed Predictions'])
ax[0].set_title('Chat Character Limit')
ax[1].boxplot([all_chat_lengths, failed_chat_lengths], tick_labels=['All Chats', 'Failed Predictions'])
ax[1].set_ylim(-2, 52)
ax[1].set_title('Chat Character Limit, Zoomed In')
plt.tight_layout()
plt.show()
Predictions succeed and fail independent of chat length from this view. For the messages falsely predicted, it comes down to standard, shared emotes and noise. For Jaccard Similarity at around .15, accuracy at around .85 is a strong result.
Expensive Model¶
# # SVM Classifier
# svm = SVC(probability=True)
# svm.fit(lp_c_X_train, lp_c_y_train)
# svm_y_pred = svm.predict(lp_c_X_test)
# svm_accuracy = accuracy_score(lp_c_y_test, svm_y_pred)
# print("SVM Accuracy:", svm_accuracy)
# print("SVM Classification Report:\n", classification_report(lp_c_y_test, svm_y_pred, target_names=lp_label_mapping.keys()))
SVM took over 2hrs before it was manually stopped. Further improvements are unlikely, because the bodies of chat messages overlap substantially, as shown by the Jaccard and cosine similarities found earlier.
More Pairwise Examinations¶
Pairwise, binary classification is expanded to the 10 possible combinations between 5 streamers. Expanding on NL and xQc, TommyInnit and JackManifoldTV are examined as two streamers with nearly identical communities, both streaming Minecraft to a similar audience (the two are, as of this moment in early 2025, known for the podcast they host together). Ibai is brought in as incredibly different from the other 4, as he is a Spanish streamer with a Spanish-speaking chat. The 5 will cover the extremes in binary classification of Twitch chat.
Similarities and NB Accuracies (Using Weak-Preprocessing and Character n-grams, as the method logically makes sense for Twitch Chat and the models performed well) are calculated for all the pairs, and a network graph is created using the granular cosine_sim to visualize the differences between the chats.
# Create Results Dataframe
# One row per streamer pair; accuracy and the two similarity columns start
# empty and are filled in by later cells.
cols = ["label1", "label2", "accuracy", "jaccard_sim", "cosine_sim"]
pairwise_results = pd.DataFrame(columns=cols)
# Set Pairs
# l1[i] / l2[i] form the i-th pair: all 10 unordered pairs of the 5 streamers
l1 = ["nl", "nl", "nl", "nl", "xqc", "xqc", "xqc", "ibai", "ibai", "tommy"]
l2 = ["xqc", "ibai", "tommy", "jack", "ibai", "tommy", "jack", "tommy", "jack", "jack"]
pairwise_results["label1"] = l1
pairwise_results["label2"] = l2
# List of vods: streamer label -> video id used for that streamer's chat
vods_dict = {
"xqc": "2362597129", # React, Streamer Life Sim 1/24/25
"nl": "2362321171", # Playing Puck, Jackbox 1/24/25
"ibai": "2351403650", # Live WC Soccer React
"tommy": "2328457566", # Joining TubboSMP
"jack": "2352536292" # Tubbo SMP (More Recently)
}
# Download Chats from vods_dict
# for key in vods_dict:
# download_chat(vods_dict[key], f"pairs_txts/{key}.txt")
# Load the chats from 'pairs_txts' and downsize to 5000 (keyed on 'label')
pairs_data = downsize_dataframe(load_data_from_directory('pairs_txts'), 'label', 5000)
# Weak preprocessing of the message text
pairs_data['text'] = weak_preprocess_text(pairs_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
pairs_unique_labels = pairs_data['label'].unique()
pairs_label_mapping = dict(zip(pairs_unique_labels, range(len(pairs_unique_labels))))
pairs_data['label_encoded'] = pairs_data['label'].map(pairs_label_mapping)
# Word clouds for all 5 streamers on a 2x3 grid (the sixth slot stays unused)
labels = ['nl', 'xqc', 'ibai', 'tommy', 'jack']
titles = ['Northernlion', 'xQc', 'Ibai', 'TommyInnit', 'JackManifoldTV']
title_by_label = dict(zip(labels, titles))
plt.figure(figsize=(12, 5))
for slot, (label, title) in enumerate(title_by_label.items(), start=1):
    plt.subplot(2, 3, slot)
    plt.axis('off')
    # One cloud per streamer, built from all of that streamer's messages
    streamer_text = ' '.join(pairs_data[pairs_data['label'] == label]['text'])
    wc = WordCloud().generate(streamer_text)
    plt.imshow(wc)
    plt.title(title)
plt.tight_layout()
plt.show()
# Generate similarities for pairwise_results
for idx in pairwise_results.index:
    l1 = pairwise_results.at[idx, "label1"]
    l2 = pairwise_results.at[idx, "label2"]
    # All messages of each streamer, joined and re-split on single spaces
    l1_words = ' '.join(pairs_data[pairs_data['label'] == l1]['text']).split(' ')
    l2_words = ' '.join(pairs_data[pairs_data['label'] == l2]['text']).split(' ')
    # Write by column name rather than position, so reordering `cols`
    # can never silently store a value in the wrong column
    pairwise_results.at[idx, "jaccard_sim"] = jaccard_similarity(l1_words, l2_words)
    pairwise_results.at[idx, "cosine_sim"] = cosine_sim(l1_words, l2_words)
# Notebook display of the partially filled table
pairwise_results
label1 | label2 | accuracy | jaccard_sim | cosine_sim | |
---|---|---|---|---|---|
0 | nl | xqc | NaN | 0.134792 | 0.331596 |
1 | nl | ibai | NaN | 0.028665 | 0.028063 |
2 | nl | tommy | NaN | 0.122396 | 0.494157 |
3 | nl | jack | NaN | 0.132297 | 0.543763 |
4 | xqc | ibai | NaN | 0.032769 | 0.038524 |
5 | xqc | tommy | NaN | 0.126143 | 0.37855 |
6 | xqc | jack | NaN | 0.131862 | 0.355978 |
7 | ibai | tommy | NaN | 0.025549 | 0.052179 |
8 | ibai | jack | NaN | 0.026887 | 0.036467 |
9 | tommy | jack | NaN | 0.172311 | 0.695233 |
# Scatter the two similarity measures against each other
jaccard_values = pairwise_results['jaccard_sim']
cosine_values = pairwise_results['cosine_sim']
# Correlation between the two measures, annotated on the figure
correlation = jaccard_values.corr(cosine_values)
plt.figure(figsize=(4, 3))
plt.scatter(jaccard_values, cosine_values, c='r')
plt.title('Jaccard Similarity vs Cosine Similarity')
plt.xlabel('Jaccard Similarity')
plt.ylabel('Cosine Similarity')
plt.text(.025, 0.675, f'Correlation: {correlation:.2f}', fontsize=8)
plt.show()
# Vectorize with character n-grams
pairs_c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split below, so its vocabulary/IDF also see the test rows.
pairs_c_features = pairs_c_vectorizer.fit_transform(pairs_data['text']).toarray()
pairs_c_labels = pairs_data['label_encoded'].values
# For each pair, split data, train and evaluate NB classifier and get accuracy
for idx in pairwise_results.index:
    l1 = pairwise_results.at[idx, "label1"]
    l2 = pairwise_results.at[idx, "label2"]
    # Rows belonging to either streamer of the pair (computed once, used twice)
    pair_mask = (pairs_data['label'] == l1) | (pairs_data['label'] == l2)
    pair_features = pairs_c_features[pair_mask]
    pair_labels = pairs_c_labels[pair_mask]
    pair_X_train, pair_X_test, pair_y_train, pair_y_test = train_test_split(
        pair_features, pair_labels, test_size=0.2, random_state=42
    )
    pair_nb_clf = MultinomialNB()
    pair_nb_clf.fit(pair_X_train, pair_y_train)
    pair_y_pred = pair_nb_clf.predict(pair_X_test)
    pair_accuracy = accuracy_score(pair_y_test, pair_y_pred)
    # Store by column name instead of a positional index
    pairwise_results.at[idx, "accuracy"] = pair_accuracy
# Notebook display of the completed table
pairwise_results
label1 | label2 | accuracy | jaccard_sim | cosine_sim | |
---|---|---|---|---|---|
0 | nl | xqc | 0.836 | 0.134792 | 0.331596 |
1 | nl | ibai | 0.9355 | 0.028665 | 0.028063 |
2 | nl | tommy | 0.8245 | 0.122396 | 0.494157 |
3 | nl | jack | 0.8345 | 0.132297 | 0.543763 |
4 | xqc | ibai | 0.9275 | 0.032769 | 0.038524 |
5 | xqc | tommy | 0.8375 | 0.126143 | 0.37855 |
6 | xqc | jack | 0.856 | 0.131862 | 0.355978 |
7 | ibai | tommy | 0.9355 | 0.025549 | 0.052179 |
8 | ibai | jack | 0.9425 | 0.026887 | 0.036467 |
9 | tommy | jack | 0.7085 | 0.172311 | 0.695233 |
# Accuracy against each similarity measure, one panel per measure
similarity_panels = [
    ('jaccard_sim', 'Jaccard Similarity'),
    ('cosine_sim', 'Cosine Similarity'),
]
plt.figure(figsize=(12, 5))
for panel_no, (column, pretty_name) in enumerate(similarity_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(pairwise_results[column], pairwise_results['accuracy'])
    plt.xlabel(pretty_name)
    plt.ylabel('Accuracy')
    plt.title(f'Accuracy vs {pretty_name}')
plt.tight_layout()
plt.show()
# Make network graph based off of pairwise similarities
G = nx.Graph()
G.add_nodes_from(pairwise_results["label1"].unique())
pair_rows = zip(
    pairwise_results["label1"],
    pairwise_results["label2"],
    pairwise_results["cosine_sim"],
)
for first, second, similarity in pair_rows:
    # Spring weight grows exponentially with similarity (2 ** (10 * sim));
    # the raw similarity is kept on the edge so labels need no log2 round-trip
    G.add_edge(first, second, weight=2 ** (similarity * 10), cosine_sim=similarity)
# Seeded like the other random steps in this notebook (random_state=42),
# so the layout is the same on every run
pos = nx.spring_layout(G, weight='weight', k=10, iterations=5000, seed=42)
plt.figure(figsize=(8, 6))
plt.title('Cosine Similarity Between Streamer Chats Network Graph')
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=1500, font_weight='bold')
# Draw edge labels (similarities)
edge_labels = {(u, v): f'{d["cosine_sim"]:.3f}' for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
plt.show()
Multiple Streamer Proof of Concept¶
Multiclass Classification is tested using 4 of the 5 streamers from the previous step (jack is removed to eliminate the tommy/jack overlap for this proof of concept). A Naive Bayes classifier is created, and likely words (words typical of that streamer and not any of the other streamers) are tested. The process is repeated for cheaper parameters that are likely needed for an expansion into full application.
Accuracy takes a significant hit when cutting messages to a fifth (5000 -> 1000 per streamer) and vectorizer features to a fourth (20000 -> 5000), though the classifier's ability to work with classic chat messages from each streamer is not heavily diminished.
Standard¶
# VOD ids for the proof of concept (jack is left out to eliminate tommy/jack overlap)
poc_vods_dict = dict(
    xqc="2362597129",    # React, Streamer Life Sim 1/24/25
    nl="2362321171",     # Playing Puck, Jackbox 1/24/25
    ibai="2351403650",   # Live WC Soccer React
    tommy="2328457566",  # Joining TubboSMP
)
# Download Chats from vods_dict
# for key in poc_vods_dict:
#     download_chat(poc_vods_dict[key], f"poc_txts/{key}.txt")
# Load the chats from 'poc_txts' and downsize to 5000 (keyed on 'label')
poc_data = downsize_dataframe(load_data_from_directory('poc_txts'), 'label', 5000)
# Weak preprocessing of the message text
poc_data['text'] = weak_preprocess_text(poc_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
poc_unique_labels = poc_data['label'].unique()
poc_label_mapping = dict(zip(poc_unique_labels, range(len(poc_unique_labels))))
poc_data['label_encoded'] = poc_data['label'].map(poc_label_mapping)
# Character n-gram TF-IDF features (1-5 chars, word-boundary aware)
poc_c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
poc_c_features = poc_c_vectorizer.fit_transform(poc_data['text']).toarray()
poc_c_labels = poc_data['label_encoded'].values
# 80/20 train/test split with a fixed seed
poc_X_train, poc_X_test, poc_y_train, poc_y_test = train_test_split(
    poc_c_features, poc_c_labels, test_size=0.2, random_state=42
)
# Fit the Naive Bayes classifier
poc_nb_clf = MultinomialNB().fit(poc_X_train, poc_y_train)
# Score it on the held-out rows
poc_y_pred = poc_nb_clf.predict(poc_X_test)
poc_accuracy = accuracy_score(poc_y_test, poc_y_pred)
poc_report = classification_report(poc_y_test, poc_y_pred, target_names=poc_label_mapping.keys())
print("NB Accuracy:", poc_accuracy)
print("NB Classification Report:\n", poc_report)
NB Accuracy: 0.76825 NB Classification Report: precision recall f1-score support ibai 0.88 0.88 0.88 995 nl 0.76 0.72 0.74 1024 tommy 0.70 0.76 0.73 1004 xqc 0.73 0.71 0.72 977 accuracy 0.77 4000 macro avg 0.77 0.77 0.77 4000 weighted avg 0.77 0.77 0.77 4000
# One hand-picked "signature" word per streamer (manually defined based on
# personal understanding of the respective chat)
likely_words = dict(xqc="omE", nl="Cereal", ibai="que", tommy="GayPride")
# Ask the classifier which streamer each signature word is attributed to
for key, word in likely_words.items():
    predicted = predict_word(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {predicted}")
xqc (omE) -> xqc nl (Cereal) -> nl ibai (que) -> ibai tommy (GayPride) -> tommy
# Same words again, this time with the confidence of the predicted label
for key, word in likely_words.items():
    outcome = predict_word_confidence(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {outcome}")
xqc (omE) -> ('xqc', 0.985383877205551) nl (Cereal) -> ('nl', 0.9997923678429135) ibai (que) -> ('ibai', 0.9993385561171404) tommy (GayPride) -> ('tommy', 0.9937931223214658)
# Per-label confidences for each signature word
for key, word in likely_words.items():
    per_label = get_confidence_for_all_labels(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {per_label}")
xqc (omE) -> {'ibai': 0.00039032212695792027, 'nl': 0.0014938117427326183, 'tommy': 0.012731988924757824, 'xqc': 0.985383877205551} nl (Cereal) -> {'ibai': 4.7077612713758734e-05, 'nl': 0.9997923678429135, 'tommy': 6.909077060448049e-05, 'xqc': 9.146377377103007e-05} ibai (que) -> {'ibai': 0.9993385561171404, 'nl': 0.0002265670485358673, 'tommy': 0.00010158189175076548, 'xqc': 0.00033329494257399746} tommy (GayPride) -> {'ibai': 0.0011773986891216652, 'nl': 0.003537630939901814, 'tommy': 0.9937931223214658, 'xqc': 0.0014918480495116312}
Cheaper Parameters, Model¶
# Cheaper setup: 5000 -> 1000 chat messages per streamer, 20000 -> 5000 max features
# Load the chats from 'poc_txts' and downsize to 1000 (keyed on 'label')
cheap_poc_data = downsize_dataframe(load_data_from_directory('poc_txts'), 'label', 1000)
# Weak preprocessing of the message text
cheap_poc_data['text'] = weak_preprocess_text(cheap_poc_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
cheap_poc_unique_labels = cheap_poc_data['label'].unique()
cheap_poc_label_mapping = dict(zip(cheap_poc_unique_labels, range(len(cheap_poc_unique_labels))))
cheap_poc_data['label_encoded'] = cheap_poc_data['label'].map(cheap_poc_label_mapping)
# Character n-gram TF-IDF features, capped at 5000 instead of 20000
cheap_poc_vectorizer = TfidfVectorizer(max_features=5000, analyzer='char_wb', ngram_range=(1, 5))
cheap_poc_features = cheap_poc_vectorizer.fit_transform(cheap_poc_data['text']).toarray()
cheap_poc_labels = cheap_poc_data['label_encoded'].values
# 80/20 train/test split with a fixed seed
cheap_poc_X_train, cheap_poc_X_test, cheap_poc_y_train, cheap_poc_y_test = train_test_split(
    cheap_poc_features, cheap_poc_labels, test_size=0.2, random_state=42
)
# Fit the Naive Bayes classifier
cheap_poc_nb_clf = MultinomialNB().fit(cheap_poc_X_train, cheap_poc_y_train)
# Score it on the held-out rows
cheap_poc_y_pred = cheap_poc_nb_clf.predict(cheap_poc_X_test)
cheap_poc_accuracy = accuracy_score(cheap_poc_y_test, cheap_poc_y_pred)
cheap_poc_report = classification_report(
    cheap_poc_y_test, cheap_poc_y_pred, target_names=cheap_poc_label_mapping.keys()
)
print("NB Accuracy:", cheap_poc_accuracy)
print("NB Classification Report:\n", cheap_poc_report)
NB Accuracy: 0.69875 NB Classification Report: precision recall f1-score support ibai 0.83 0.84 0.83 209 nl 0.70 0.60 0.65 213 tommy 0.63 0.65 0.64 194 xqc 0.63 0.70 0.66 184 accuracy 0.70 800 macro avg 0.70 0.70 0.70 800 weighted avg 0.70 0.70 0.70 800
# Signature words scored by the cheaper model, with predicted-label confidence
for key, word in likely_words.items():
    outcome = predict_word_confidence(word, cheap_poc_nb_clf, cheap_poc_vectorizer, cheap_poc_label_mapping)
    print(f"{key} ({word}) -> {outcome}")
xqc (omE) -> ('xqc', 0.895962644027219) nl (Cereal) -> ('nl', 0.9941023736550885) ibai (que) -> ('ibai', 0.9944629253379634) tommy (GayPride) -> ('tommy', 0.9417083636400793)
# Per-label confidences from the cheaper model for each signature word
for key, word in likely_words.items():
    per_label = get_confidence_for_all_labels(word, cheap_poc_nb_clf, cheap_poc_vectorizer, cheap_poc_label_mapping)
    print(f"{key} ({word}) -> {per_label}")
xqc (omE) -> {'ibai': 0.009010011324352836, 'nl': 0.018529435312124486, 'tommy': 0.07649790933630353, 'xqc': 0.895962644027219} nl (Cereal) -> {'ibai': 0.0010749494167538163, 'nl': 0.9941023736550885, 'tommy': 0.0013668750933012144, 'xqc': 0.0034558018348584347} ibai (que) -> {'ibai': 0.9944629253379634, 'nl': 0.0022253504209979354, 'tommy': 0.0010937822202266318, 'xqc': 0.0022179420208105023} tommy (GayPride) -> {'ibai': 0.01699206086863734, 'nl': 0.025082190675351754, 'tommy': 0.9417083636400793, 'xqc': 0.016217384815931157}
Multiple Streamer Classification¶
A more complete implementation is tested.
Top 100 English Channels¶
First, we take the Top 100 English Streamers by Avg. Viewers in the last 30 Days from TwitchTracker.com, automatically get their most recent VOD, and download the chat from there. This process was not a complete failure, though there were critical failures, shown by the wordclouds. A most recent VOD can be unrepresentative of a chat's actual, typical behavior. The obtained Ludwig VOD was mostly his chat playing Pokemon by inputting actions, which is not the chat experience representative of a Ludwig stream. The same can be said for GamesDoneQuick, which had more grandma talk than expected. Also, the top 100 list includes certain non-personalities, such as Esports Broadcasting channels (e.g. ow_esports, or Overwatch Esports). These channels don't really have a personality to reflect, and don't come with a chat worth investigating. For the most part, chats for esports broadcasts are not part of the viewership experience for those watching.
# Top 100 English Speaking Streamers of January 2025, from TwitchTracker.com
# (kept in the listed order, five names per row)
top100streamers = [
    "Caedrel", "zackrawrr", "KaiCenat", "caseoh_", "GamesDoneQuick",
    "HasanAbi", "BLASTPremier", "sodapoppin", "plaqueboymax", "Jynxzi",
    "ohnePixel", "TimTheTatman", "xQc", "loltyler1", "shroud",
    "PirateSoftware", "tarik", "stableronaldo", "sinatraa", "Pikabooirl",
    "summit1g", "easportsfc", "jasontheween", "Thebausffs", "yourragegaming",
    "erobb221", "Mizkif", "Lacy", "Xaryu", "Quin69",
    "ESLCS", "Necros", "LCK", "LEC", "vedal987",
    "Ludwig", "Clix", "fissure_dota_en", "forsen", "Agent00",
    "NiceWigg", "k3soju", "Velcuz", "Emiru", "MOONMOON",
    "Twitch", "Gorgc", "LIRIK", "Warframe", "Lord_Kebun",
    "ironmouse", "RocketLeague", "VALORANT_EMEA", "Elajjaz", "ow_esports",
    "playapex", "ESLCSb", "LTANorth", "Northernlion", "Valkyrae",
    "Rekkles", "angryginge13", "supertf", "Nmplol", "MrSavage",
    "Ziqoftw", "RDCgaming", "CohhCarnage", "LVNDMARK", "DonaldTrump",
    "VALORANT_Americas", "Fanum", "shanks_ttv", "Nadeshot", "PaymoneyWubby",
    "AuzioMF", "runthefutmarket", "2xRaKai", "WarThunder_Esports", "Adapt",
    "ExtraEmily", "mooda", "Simurgh", "HLTVorg", "Grubby",
    "Jerma985", "Ninja", "Unboxholics", "AussieAntics", "Silky",
    "BarbarousKing", "AdmiralBahroo", "chess24", "Zentreya", "PENTA",
    "PlayHearthstone", "Psychoghost", "CDawgVA", "Glorious_E", "lol_nemesis",
]
# # Get vod ids for top 100 streamers
# top100_vod_id = {}
# for streamer in top100streamers:
# user_id = get_user_id(streamer)
# if user_id:
# vod_ids = user_vod_ids(user_id)
# # delay to avoid rate-limiting
# time.sleep(.2)
# if vod_ids:
# top100_vod_id[streamer] = vod_ids[0]
# else:
# print(f"No vods found for {streamer}")
# else:
# print(f"No user found for {streamer}")
# download chats for top 100 streamers
# for streamer in top100_vod_id:
# download_chat(top100_vod_id[streamer], f"t100/{streamer}.txt")
# Load the downloaded top-100 chats
t100_data = load_data_from_directory('t100')
# Report every label that has fewer than 1000 rows
print("Labels with count < 1000:")
label_counts = t100_data['label'].value_counts()
for label, count in label_counts[label_counts < 1000].items():
    print(f"{label}: {count}")
Labels with count < 1000: HasanAbi: 802 Rekkles: 689 summit1g: 414
# Downsize to 1000 (keyed on 'label')
t100_data = downsize_dataframe(t100_data, 'label', 1000)
# Weak preprocessing of the message text
t100_data['text'] = weak_preprocess_text(t100_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
t100_unique_labels = t100_data['label'].unique()
t100_label_mapping = dict(zip(t100_unique_labels, range(len(t100_unique_labels))))
t100_data['label_encoded'] = t100_data['label'].map(t100_label_mapping)
# Character n-gram TF-IDF features (1-5 chars, word-boundary aware)
t100_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
t100_features = t100_vectorizer.fit_transform(t100_data['text']).toarray()
t100_labels = t100_data['label_encoded'].values
# 80/20 train/test split with a fixed seed
t100_X_train, t100_X_test, t100_y_train, t100_y_test = train_test_split(
    t100_features, t100_labels, test_size=0.2, random_state=42
)
# Fit the Naive Bayes classifier; the bare fit() expression is left last so
# the notebook still displays the fitted estimator
t100_nb_clf = MultinomialNB()
t100_nb_clf.fit(t100_X_train, t100_y_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
# Score the top-100 classifier on the held-out rows
t100_y_pred = t100_nb_clf.predict(t100_X_test)
t100_accuracy = accuracy_score(t100_y_test, t100_y_pred)
t100_report = classification_report(t100_y_test, t100_y_pred, target_names=t100_label_mapping.keys())
print("NB Accuracy:", t100_accuracy)
print("NB Classification Report:\n", t100_report)
NB Accuracy: 0.23883009227881757 NB Classification Report: precision recall f1-score support Adapt 0.24 0.06 0.09 195 AdmiralBahroo 0.21 0.36 0.27 187 Agent00 0.30 0.08 0.13 214 AussieAntics 0.08 0.40 0.13 195 AuzioMF 0.34 0.28 0.30 174 BLASTPremier 0.38 0.25 0.30 179 BarbarousKing 0.29 0.15 0.19 220 CDawgVA 0.33 0.20 0.25 196 Caedrel 0.55 0.23 0.33 212 Clix 0.36 0.09 0.14 201 CohhCarnage 0.34 0.52 0.42 197 ESLCS 0.26 0.38 0.31 203 ESLCSb 0.19 0.13 0.15 212 Elajjaz 0.43 0.26 0.33 187 Emiru 0.27 0.21 0.24 194 ExtraEmily 0.32 0.26 0.29 184 Fanum 0.33 0.17 0.23 210 GamesDoneQuick 0.31 0.33 0.32 193 Glorious_E 0.30 0.20 0.24 192 Gorgc 0.31 0.16 0.21 202 Grubby 0.10 0.43 0.16 195 HLTVorg 0.41 0.30 0.35 212 HasanAbi 0.44 0.37 0.40 150 Jerma985 0.20 0.31 0.24 208 Jynxzi 0.38 0.21 0.27 206 KaiCenat 0.29 0.14 0.18 185 LCK 0.16 0.34 0.22 180 LEC 0.24 0.10 0.14 194 LIRIK 0.19 0.13 0.15 205 LTANorth 0.44 0.15 0.22 193 LVNDMARK 0.15 0.25 0.19 208 Lacy 0.20 0.10 0.13 188 Lord_Kebun 0.21 0.23 0.22 220 Ludwig 0.40 0.47 0.43 191 MOONMOON 0.30 0.16 0.21 218 Mizkif 0.17 0.11 0.13 199 MrSavage 0.37 0.21 0.26 218 Nadeshot 0.17 0.21 0.19 214 Necros 0.37 0.28 0.31 206 NiceWigg 0.34 0.20 0.26 201 Ninja 0.19 0.19 0.19 206 Nmplol 0.20 0.16 0.18 218 Northernlion 0.25 0.18 0.21 219 PENTA 0.41 0.22 0.28 199 PaymoneyWubby 0.38 0.21 0.27 187 Pikabooirl 0.16 0.05 0.08 203 PirateSoftware 0.13 0.19 0.15 208 PlayHearthstone 0.37 0.26 0.30 192 Psychoghost 0.06 0.56 0.11 187 Quin69 0.30 0.41 0.35 192 RDCgaming 0.62 0.27 0.38 199 Rekkles 0.66 0.28 0.39 160 RocketLeague 0.33 0.34 0.34 196 Silky 0.20 0.24 0.22 226 Simurgh 0.21 0.24 0.22 214 Thebausffs 0.34 0.27 0.30 179 TimTheTatman 0.33 0.18 0.23 198 Twitch 0.38 0.44 0.41 185 VALORANT_Americas 0.39 0.24 0.30 202 VALORANT_EMEA 0.20 0.25 0.22 203 Valkyrae 0.34 0.30 0.32 185 Velcuz 0.34 0.42 0.38 196 WarThunder_Esports 0.30 0.37 0.33 202 Warframe 0.27 0.57 0.37 188 Xaryu 0.19 0.21 0.20 199 Zentreya 0.30 0.30 0.30 193 Ziqoftw 0.48 0.15 0.23 216 angryginge13 
0.30 0.30 0.30 204 caseoh_ 0.36 0.13 0.19 211 chess24 0.41 0.30 0.34 205 easportsfc 0.27 0.25 0.26 203 erobb221 0.36 0.25 0.29 224 fissure_dota_en 0.52 0.21 0.30 211 forsen 0.40 0.59 0.48 199 ironmouse 0.31 0.18 0.23 184 jasontheween 0.17 0.14 0.15 206 k3soju 0.40 0.21 0.27 197 lol_nemesis 0.30 0.18 0.22 199 loltyler1 0.19 0.14 0.16 197 mooda 0.19 0.43 0.27 204 ohnePixel 0.15 0.18 0.16 202 ow_esports 0.16 0.12 0.14 176 plaqueboymax 0.47 0.16 0.24 191 playapex 0.71 0.43 0.54 191 runthefutmarket 0.13 0.20 0.16 192 shanks_ttv 0.17 0.28 0.21 209 shroud 0.26 0.08 0.12 199 sinatraa 0.33 0.12 0.18 207 sodapoppin 0.19 0.25 0.21 207 stableronaldo 0.11 0.10 0.11 191 summit1g 0.67 0.13 0.21 79 supertf 0.28 0.14 0.19 202 tarik 0.33 0.11 0.17 209 vedal987 0.35 0.30 0.33 200 xQc 0.28 0.20 0.23 188 yourragegaming 0.18 0.19 0.18 197 zackrawrr 0.16 0.08 0.11 207 accuracy 0.24 19181 macro avg 0.30 0.24 0.24 19181 weighted avg 0.30 0.24 0.24 19181
# Spot-check a few words. They are fairly representative of certain streamers,
# but are mostly here to give an idea of the confidence outputs
test_words = ["omE", "Cereal", "+2", "Vamos", "HYPE", "meow", "?????", "W"]
for word in test_words:
    outcome = predict_word_confidence(word, t100_nb_clf, t100_vectorizer, t100_label_mapping)
    print(f"{word} -> {outcome}")
omE -> ('Velcuz', 0.10749637964058793) Cereal -> ('Northernlion', 0.46834438477445667) +2 -> ('Northernlion', 0.48310941232733584) Vamos -> ('VALORANT_Americas', 0.18433942776219692) HYPE -> ('Twitch', 0.07199855009141166) meow -> ('LTANorth', 0.16426181729494213) ????? -> ('shanks_ttv', 0.14745575555816642) W -> ('mooda', 0.06407703459188754)
# Word clouds for three selected labels, side by side
plt.figure(figsize=(12, 6))
for panel_no, streamer in enumerate(['Ludwig', 'GamesDoneQuick', 'ow_esports'], start=1):
    plt.subplot(1, 3, panel_no)
    plt.axis('off')
    wc = WordCloud().generate(' '.join(t100_data[t100_data['label'] == streamer]['text']))
    plt.imshow(wc)
    plt.title(streamer)
plt.tight_layout()
plt.show()
Curated List of Top Streamers, 1000 messages per streamer¶
A Curated list of VODs is created to represent the Top 50 English Personalities, and other personalities that should also be included for twitch cultural relevance, or for having unique chats, coming out to 82 Streamers. Vods are hand-selected to be representative. The data is capped to be only 1000 messages each per streamer to cut down on processing time.
Positive results are again obtained, and the relationship between accuracy and confidence is examined.
# Streamer -> VOD id, based on the Top 50 Streamers of January 2025.
# Selection criteria (personally chosen):
#   - channels / vods with 5000+ messages, under 10 hours, not controversial
#     (to my understanding)
#   - relevant english personalities (not event channels) with active chats
#   - plus notable streamers deeper down the top avg viewership list
# Vods curated to be representative of typical content to my understanding
streamer_vods_dict = {
    "KaiCenat": "2369101926",
    "caseoh_": "2374536052",
    "xQc": "2372442853",
    "HasanAbi": "2373350488",
    "Jynxzi": "2370784820",
    "TimTheTatman": "2368684088",
    "yourragegaming": "2371768096",
    "loltyler1": "2305694222",
    "Emiru": "2368082972",
    "stableronaldo": "2369922252",
    "sodapoppin": "2368630449",
    "Nmplol": "2354008082",
    "Thebausffs": "2373924882",
    "tarik": "2374297065",
    "Fanum": "2371078809",
    "Jerma985": "2364495585",
    "NiceWigg": "2370111298",
    "PaymoneyWubby": "2373603842",
    "k3soju": "2374654925",
    "Ludwig": "2369047103",
    "Clix": "2373494800",
    "RDCgaming": "2371018865",
    "MOONMOON": "2374355157",
    "LIRIK": "2370616578",
    "Northernlion": "2373219071",
    "shroud": "2371796733",
    "Pokimane": "2365252668",
    "DougDoug": "2359836833",
    "Erobb221": "2353266197",
    "summit1g": "2373391595",
    "CohhCarnage": "2365882528",
    "Gorgc": "2373983009",
    "Tenz": "2371956512",
    "Ironmouse": "2370341972",
    "Elajjaz": "2370520964",
    "Silky": "2365668281",
    "CDawgVA": "2362828964",
    "supertf": "2371930154",
    "Ninja": "2368566120",
    "Tubbo": "2371501050",
    "Rekkles": "2368432207",
    "Maximilian_DOOD": "2372857427",
    "Duke": "2334535229",
    "Vinesauce": "2371038452",
    "AdmiralBahroo": "2371570506",
    "Doublelift": "2373428024",
    "PENTA": "2370870696",
    "BotezLive": "2368806611",
    "Maya": "2352403390",
    "MoistCr1tikal": "2374424054",
    "MitchJones": "2346454726",
    "WillNeff": "2374535099",
    "Scump": "2365093422",
    "Atrioc": "2365643219",
    "Wirtual": "2366108791",
    "39Daph": "2373794511",
    "Kitboga": "2374045089",
    "Sykkuno": "2372561637",
    "RTGame": "2367895634",
    "Nymn": "2371329490",
    "QTCinderella": "2365701390",
    "EsfandTV": "2374433871",
    "DrLupo": "2373165739",
    "AvoidingThePuddle": "2373458227",
    "SmallAnt": "2374224186",
    "Cyr": "2374339519",
    "GeminiTay": "2358621826",
    "Philza": "2368729628",
    "LilyPichu": "2368909378",
    "jasontheween": "2370876605",
    "PirateSoftware": "2372088543",
    "Caedrel": "2373860550",
    "plaqueboymax": "2371057098",
    "Necros": "2370396275",
    "ohnePixel": "2372167845",
    "Valkyrae": "2373345336",
    "tommyinnit": "2328457566",
    "Squeex": "2371706296",
    "Vargskelethor": "2370922396",
    "Coney": "2370966784",
    "Emongg": "2373896418",
    "shanks_ttv": "2373851247",
}
# Report how many streamers made the curated list
print(f" Number of Streamers: {len(streamer_vods_dict)}")
Number of Streamers: 82
# Download Chats
# for key in streamer_vods_dict:
# download_chat(streamer_vods_dict[key], f"curated/{key}.txt")
A single Chat Download takes about 45 seconds.
# Load the curated chats
curated_data = load_data_from_directory('curated')
# Report every label that has fewer than 1000 rows
print("Streamers with < 1000 Messages:")
label_counts = curated_data['label'].value_counts()
for label, count in label_counts[label_counts < 1000].items():
    print(f"{label}: {count}")
Streamers with < 1000 Messages:
# Downsize to 1000 (keyed on 'label')
curated_data = downsize_dataframe(curated_data, 'label', 1000)
# Weak preprocessing of the message text
curated_data['text'] = weak_preprocess_text(curated_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
curated_unique_labels = curated_data['label'].unique()
curated_label_mapping = dict(zip(curated_unique_labels, range(len(curated_unique_labels))))
curated_data['label_encoded'] = curated_data['label'].map(curated_label_mapping)
# Character n-gram TF-IDF features (1-5 chars, word-boundary aware)
curated_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
curated_features = curated_vectorizer.fit_transform(curated_data['text']).toarray()
curated_labels = curated_data['label_encoded'].values
# 80/20 train/test split with a fixed seed
curated_X_train, curated_X_test, curated_y_train, curated_y_test = train_test_split(
    curated_features, curated_labels, test_size=0.2, random_state=42
)
# Fit the Naive Bayes classifier
curated_nb_clf = MultinomialNB().fit(curated_X_train, curated_y_train)
# Score it on the held-out rows
curated_y_pred = curated_nb_clf.predict(curated_X_test)
curated_accuracy = accuracy_score(curated_y_test, curated_y_pred)
print("NB Accuracy:", curated_accuracy)
NB Accuracy: 0.22628048780487806
# Draw a fixed-seed 10% sample of curated_data (the variable keeps its
# historical name `curated_1p`).
# NOTE(review): the sample is taken from all of curated_data, which also
# supplied the classifier's training rows.
curated_1p = curated_data.sample(frac=0.1, random_state=42)
# Predict every sampled message and store label + confidence in two new columns
sample_predictions = curated_1p['text'].apply(
    lambda message: predict_word_confidence(message, curated_nb_clf, curated_vectorizer, curated_label_mapping)
)
curated_1p['pred_label'], curated_1p['pred_conf'] = zip(*sample_predictions)
# Split the sample into correct and incorrect predictions
is_correct = curated_1p['label'] == curated_1p['pred_label']
correct_rows = curated_1p[is_correct]
incorrect_rows = curated_1p[~is_correct]
correct_count = correct_rows.shape[0]
incorrect_count = incorrect_rows.shape[0]
# Side-by-side boxplots of the confidence for both groups
plt.figure(figsize=(8, 6))
plt.boxplot(
    [correct_rows['pred_conf'], incorrect_rows['pred_conf']],
    tick_labels=[f'Correct (Count: {correct_count})', f'Incorrect (Count: {incorrect_count})'],
)
plt.title('Prediction Confidence')
plt.ylabel('Confidence')
plt.show()
# Line graph of accuracy vs confidence
plt.figure(figsize=(8, 6))
# Confidence thresholds 0.0, 0.1, ..., 0.9
conf_thresholds = [step / 10 for step in range(10)]
prediction_hit = curated_1p['label'] == curated_1p['pred_label']
# Accuracy among predictions whose confidence is at least each threshold
nb_accuracy = []
for conf in conf_thresholds:
    at_least_conf = curated_1p['pred_conf'] >= conf
    kept = int(at_least_conf.sum())
    hits = int((at_least_conf & prediction_hit).sum())
    # The old `hits / kept + 0.0001` added 0.0001 to the ratio (operator
    # precedence) and still divided by zero when nothing reached the
    # threshold. An empty bucket is now recorded as NaN instead.
    nb_accuracy.append(hits / kept if kept else np.nan)
# Plot line and points, label points (NaN buckets are left unlabelled)
plt.plot(conf_thresholds, nb_accuracy, marker='o', alpha=0.5)
for conf, acc in zip(conf_thresholds, nb_accuracy):
    if not np.isnan(acc):
        plt.text(conf, acc + .01, f'{acc:.2f}', fontsize=8, ha='center', va='bottom')
plt.title('Accuracy vs Confidence')
plt.xlabel('Confidence')
plt.ylabel('Accuracy')
plt.show()
Curated Further to 50 Streamers, 5000 messages per streamer¶
As a Final Implementation, the list is cut to an even top 50 and 5000 messages are used per streamer. This allows for a stronger NB classifier, and an expensive (though not overly) Random Forest Classifier is trained with this setup. RF, XGBoost, and SVM were attempted but took far too long to train. Dimensionality Reduction was also tested, but failed due to time. In revisiting this project, a more expensive model would be an ideal improvement.
The classifier is also examined when classifying multiple chat messages from a single streamer at once, and the curve of classifier accuracy versus the number of chat messages sampled is plotted. The classifier improves dramatically when it is given more messages to work with.
# Streamer -> VOD id for the final list: the top streamers curated down to
# 50, kept for active/engaged chats or true notability
smaller_streamer_vods_dict = {
    "KaiCenat": "2369101926",
    "caseoh_": "2374536052",
    "xQc": "2372442853",
    "HasanAbi": "2373350488",
    "Jynxzi": "2370784820",
    "TimTheTatman": "2368684088",
    "yourragegaming": "2371768096",
    "loltyler1": "2305694222",
    "Emiru": "2368082972",
    "stableronaldo": "2369922252",
    "sodapoppin": "2368630449",
    "Nmplol": "2354008082",
    "Thebausffs": "2373924882",
    "tarik": "2374297065",
    "Fanum": "2371078809",
    "Jerma985": "2364495585",
    "NiceWigg": "2370111298",
    "PaymoneyWubby": "2373603842",
    "k3soju": "2374654925",
    "Ludwig": "2369047103",
    "Clix": "2373494800",
    "RDCgaming": "2371018865",
    "MOONMOON": "2374355157",
    "LIRIK": "2370616578",
    "Northernlion": "2373219071",
    "shroud": "2371796733",
    "Pokimane": "2365252668",
    "DougDoug": "2359836833",
    "Erobb221": "2353266197",
    "summit1g": "2373391595",
    "CohhCarnage": "2365882528",
    "Tenz": "2371956512",
    "Ironmouse": "2370341972",
    "CDawgVA": "2362828964",
    "supertf": "2371930154",
    "Maximilian_DOOD": "2372857427",
    "Duke": "2334535229",
    "Vinesauce": "2371038452",
    "Doublelift": "2373428024",
    "WillNeff": "2374535099",
    "Atrioc": "2365643219",
    "39Daph": "2373794511",
    "Sykkuno": "2372561637",
    "Nymn": "2371329490",
    "AvoidingThePuddle": "2373458227",
    "LilyPichu": "2368909378",
    "PirateSoftware": "2372088543",
    "ohnePixel": "2372167845",
    "Valkyrae": "2373345336",
    "Squeex": "2371706296",
}
# Download Chats
# for key in smaller_streamer_vods_dict:
#     download_chat(smaller_streamer_vods_dict[key], f"sm_curated_50/{key}.txt")
# Load the chats of the 50-streamer list
smaller_curated_data = load_data_from_directory('sm_curated_50')
# Report every label that has fewer than 5000 rows
print("Streamers with < 5000 Messages:")
label_counts = smaller_curated_data['label'].value_counts()
for label, count in label_counts[label_counts < 5000].items():
    print(f"{label}: {count}")
Streamers with < 5000 Messages: Doublelift: 4918 LilyPichu: 3722
This discrepancy is ignored: having 4% of the streamers slightly undersampled is not mission critical for this exploration.
# Downsize to 5000 (keyed on 'label')
smaller_curated_data = downsize_dataframe(smaller_curated_data, 'label', 5000)
# Weak preprocessing of the message text
smaller_curated_data['text'] = weak_preprocess_text(smaller_curated_data['text'])
# Integer-encode the labels, numbered in the order unique() returns them
smaller_curated_unique_labels = smaller_curated_data['label'].unique()
smaller_curated_label_mapping = dict(
    zip(smaller_curated_unique_labels, range(len(smaller_curated_unique_labels)))
)
smaller_curated_data['label_encoded'] = smaller_curated_data['label'].map(smaller_curated_label_mapping)
# Character n-gram TF-IDF features (1-5 chars, word-boundary aware)
smaller_curated_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
smaller_curated_features = smaller_curated_vectorizer.fit_transform(smaller_curated_data['text']).toarray()
smaller_curated_labels = smaller_curated_data['label_encoded'].values
# Persist the fitted vectorizer
dump(smaller_curated_vectorizer, "sc_v.joblib")
# 80/20 train/test split with a fixed seed
(
    smaller_curated_X_train,
    smaller_curated_X_test,
    smaller_curated_y_train,
    smaller_curated_y_test,
) = train_test_split(smaller_curated_features, smaller_curated_labels, test_size=0.2, random_state=42)
# Fit the Naive Bayes classifier and persist it
smaller_curated_nb_clf = MultinomialNB().fit(smaller_curated_X_train, smaller_curated_y_train)
dump(smaller_curated_nb_clf, 'smaller_curated_nb_model.joblib')
# Score it on the held-out rows
smaller_curated_y_pred = smaller_curated_nb_clf.predict(smaller_curated_X_test)
smaller_curated_accuracy = accuracy_score(smaller_curated_y_test, smaller_curated_y_pred)
print("NB Accuracy:", smaller_curated_accuracy)
NB Accuracy: 0.3043959137709138
# load models
sc_nb = load('smaller_curated_nb_model.joblib')
sc_v = load('sc_v.joblib')
# Plot accuracy vs k for NB
# Largest number of sampled chat messages to simulate; k runs over 1..max_k.
# Defined once so the loop, the x values and the x ticks cannot drift apart.
max_k = 20
k_values = range(1, max_k + 1)
# get results, accuracy for each k (only the accuracy is used below)
accuracies = []
for k in k_values:
    results, acc = simulate_k_predictions_for_each_label(smaller_curated_data, k, sc_nb, sc_v, smaller_curated_label_mapping)
    accuracies.append(acc)
# Plot accuracy vs k TODO Plot with better models as well
plt.figure(figsize=(8, 6))
plt.plot(k_values, accuracies, marker='o')
plt.title('Prediction Accuracy vs Number of Chat Messages Sampled per Streamer')
plt.xlabel('# Chats Sampled')
plt.ylabel('Accuracy')
plt.xticks(np.arange(0, max_k + 1, 2))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.ylim(0, 1)
plt.show()
Chat Similarity Network¶
Similarities are calculated for the curated list of 50 streamers. Using cosine similarity, a spring-layout graph is created to visualize the chat-based communities among the iconic English-language streamers. There is potential for more interesting visualizations with this data, though the final graph is the cleanest representation I could create to show groupings.
num_streamers = len(smaller_curated_label_mapping)

# Square streamer-by-streamer matrices for cosine and Jaccard similarity.
# Only the strict upper triangle (i < j) is filled below; every other cell stays 0.
matrix_shape = (num_streamers, num_streamers)
curated_cosine_similarity = np.zeros(matrix_shape)
curated_jaccard_similarity = np.zeros(matrix_shape)

# One token list per encoded label: join that streamer's messages and split on single spaces.
# The text column was already preprocessed and downsized upstream.
words_list = [
    ' '.join(
        smaller_curated_data.loc[smaller_curated_data['label_encoded'] == code, 'text']
    ).split(' ')
    for code in range(num_streamers)
]

# Score every unordered pair of streamers exactly once.
for i, words_i in enumerate(words_list):
    for j in range(i + 1, num_streamers):
        words_j = words_list[j]
        curated_cosine_similarity[i, j] = cosine_sim(words_i, words_j)
        curated_jaccard_similarity[i, j] = jaccard_similarity(words_i, words_j)
# Side-by-side boxplots of the pairwise cosine and Jaccard similarities.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Each pair is stored once, above the diagonal, so only those cells are plotted.
upper_triangle = np.triu_indices(num_streamers, k=1)
panels = (
    ('Cosine Similarity', curated_cosine_similarity),
    ('Jaccard Similarity', curated_jaccard_similarity),
)
for ax, (title, matrix) in zip(axs, panels):
    ax.boxplot(matrix[upper_triangle])
    ax.set_title(title)
    ax.set_xticks([])

# Only the left panel carries the y-axis label.
axs[0].set_ylabel('Similarity')
plt.show()
# Create a graph from the cosine similarity matrix: one node per streamer, with
# edges linking each streamer to its k most similar peers.
G = nx.Graph()

# Add nodes with labels. Invert the label -> index mapping once instead of doing
# a linear search through the mapping for every node.
index_to_label = {idx: label for label, idx in smaller_curated_label_mapping.items()}
for i in range(num_streamers):
    streamer_name = index_to_label[i]
    G.add_node(i, label=streamer_name)

# curated_cosine_similarity is only populated above the diagonal (i < j), so a
# row-wise search on it would ignore every lower-indexed streamer and leave the
# last row all zeros. Build a symmetric view so each streamer is compared with
# all others. np.maximum is a no-op if the matrix is already symmetric.
symmetric_cosine = np.maximum(curated_cosine_similarity, curated_cosine_similarity.T)
# Guard against self-loops regardless of what the diagonal holds.
np.fill_diagonal(symmetric_cosine, 0.0)

# Add edges, for each streamer only add the k highest cosine similarity edges if similarity > threshold
k = 1
threshold = .001
for i in range(num_streamers):
    # Indices of the k largest similarities in row i, highest first.
    top_indices = np.argsort(symmetric_cosine[i])[::-1][:k]
    for j in top_indices:
        cos_sim = float(symmetric_cosine[i, j])
        if cos_sim > threshold:
            # Cast to a plain int so node ids stay the same type as those added above.
            G.add_edge(i, int(j), weight=cos_sim)
# Draw the graph on a square canvas using a seeded spring layout.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, weight='weight', seed=42, k=2, iterations=20000)
labels = nx.get_node_attributes(G, 'label')

# Nodes: coloured by detected community, sized by degree.
# NOTE(review): community.best_partition is presumably python-louvain; it is called
# without a seed here, so the colouring may vary between runs - confirm.
partition = community.best_partition(G)
unique_communities = set(partition.values())
# Cycle through the 10 tab10 colours if there are more than 10 communities.
color_map = {
    com: plt.cm.tab10(slot % 10)
    for slot, com in enumerate(unique_communities)
}
graph_nodes = list(G.nodes())
node_colors = [color_map[partition[n]] for n in graph_nodes]
# Base size of 200, plus 200 per incident edge.
node_sizes = [200 + 200 * G.degree(n) for n in graph_nodes]
nx.draw_networkx_nodes(
    G,
    pos,
    node_size=node_sizes,
    node_color=node_colors,
    edgecolors='black',
    linewidths=1.5,
)

# Edges
nx.draw_networkx_edges(G, pos, alpha=1, width=2)

# Labels: one text box per node, placed at the node's layout position.
texts = [
    plt.text(
        x,
        y,
        labels[node],
        fontsize=10,
        bbox=dict(facecolor='White', edgecolor='none', boxstyle='round,pad=0.15', alpha=.8),
    )
    for node, (x, y) in pos.items()
]

# Adjust text to avoid overlap
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

plt.title('Streamer Cosine Similarity Network')
plt.axis('off')
plt.show()