import subprocess
import requests
import json
import os
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPClassifier
import networkx as nx
import time
from joblib import dump, load
from sklearn.svm import SVC
from adjustText import adjust_text
import community
from xgboost import XGBClassifier
from sklearn.decomposition import TruncatedSVD

# Download the chat of video_id v to output file o
# Saves in format of [username: message]
def download_chat(v, o, collision="Exit"):
    """Download the chat of one Twitch video with TwitchDownloaderCLI.

    Runs the ``./TwitchDownloaderCLI chatdownload`` command with timestamps
    disabled and 32 download threads.

    Args:
        v: Video id passed to ``--id``.
        o: Output path passed to ``-o``.
        collision: Value passed to ``--collision`` (what the CLI should do
            when the output file already exists).

    A non-zero exit status of the CLI is reported on stdout and otherwise
    ignored; nothing is returned.
    """
    cli_args = ["./TwitchDownloaderCLI", "chatdownload"]
    cli_args += ["--id", v]
    cli_args += ["-o", o]
    cli_args += ["--timestamp-format", "None"]
    cli_args += ["--threads", "32"]
    cli_args += ["--collision", collision]

    try:
        subprocess.run(cli_args, check=True)
    except subprocess.CalledProcessError as err:
        # Best effort: report the failure and let the caller carry on.
        print("Error downloading chat:", err)

# Download chats from list of video_ids, appends txt files together
def download_chat_list(ids, output):
    """Download several chats and append them all to one text file.

    Each video is first downloaded to a temporary ``v<N>.txt`` file in the
    current working directory (N counts from 1), then the temporary files
    are appended to ``output`` in order and deleted.

    Args:
        ids: Sequence of video ids.
        output: Path of the combined file; opened in append mode, so existing
            content is kept.

    A video whose download failed (``download_chat`` only prints the error)
    leaves no temporary file; such videos are skipped with a message instead
    of aborting the merge and leaving the other temporary files behind.
    """
    # Download individual chats, remembering where each one was written.
    part_paths = []
    for number, video in enumerate(ids, start=1):
        part = f"v{number}.txt"
        download_chat(video, part)
        part_paths.append(part)

    # UTF-8 on both sides matches how load_data_from_directory reads the
    # files, independent of the platform's default encoding.
    with open(output, "a", encoding="utf-8") as outfile:
        for part in part_paths:
            if not os.path.exists(part):
                print("Skipping missing chat file:", part)
                continue
            with open(part, "r", encoding="utf-8") as infile:
                outfile.write(infile.read())
            # The part has been merged; the temporary file is no longer needed.
            os.remove(part)

client_id = "removed"
client_secret = "removed"

def get_oauth_token():
    """Fetch an app access token from Twitch and store it globally.

    Posts the module-level ``client_id`` / ``client_secret`` to the Twitch
    token endpoint using the ``client_credentials`` grant and stores the
    returned ``access_token`` in the module-level ``oauth_token`` variable,
    which the other API helpers read.

    Raises:
        requests.HTTPError: If Twitch answers with an error status (for
            example bad credentials), instead of failing later with an
            opaque ``KeyError`` on the missing ``access_token`` field.
    """
    global oauth_token
    url = "https://id.twitch.tv/oauth2/token"
    params = {
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials",
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.post(url, params=params, timeout=10)
    response.raise_for_status()
    oauth_token = response.json()["access_token"]

def get_user_id(username):
    """Return the Twitch user id for a login name.

    Reads the module-level ``client_id`` and ``oauth_token``, so
    ``get_oauth_token()`` must have been called first.

    Args:
        username: Twitch login name.

    Returns:
        The ``id`` field of the first entry in the response's ``data`` list.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        IndexError: If the API returns no user for ``username``.
    """
    response = requests.get(
        "https://api.twitch.tv/helix/users",
        # Passing the login as a parameter lets requests URL-encode it.
        params={"login": username},
        headers={"Client-ID": client_id, "Authorization": f"Bearer {oauth_token}"},
        timeout=10,
    )
    response.raise_for_status()
    users = response.json()["data"]
    if not users:
        raise IndexError(f"No Twitch user found for login {username!r}")
    return users[0]["id"]

# returns last 5 vod ids of user_id
def user_vod_ids(user_id, first=5):
    """Return the ids of a user's most recent archived videos.

    Reads the module-level ``client_id`` and ``oauth_token``, so
    ``get_oauth_token()`` must have been called first.

    Args:
        user_id: Twitch user id (see ``get_user_id``).
        first: Number of videos to request; defaults to 5, the value that
            was previously hard-coded.

    Returns:
        List of the ``id`` fields of the entries in the response's ``data``
        list, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.get(
        "https://api.twitch.tv/helix/videos",
        params={"user_id": user_id, "type": "archive", "first": first},
        headers={"Client-ID": client_id, "Authorization": f"Bearer {oauth_token}"},
        timeout=10,
    )
    response.raise_for_status()
    return [video["id"] for video in response.json()["data"]]

# gets txt files from directory, loads lines into df labelled by file name
def load_data_from_directory(directory):
    """Load every ``.txt`` file of a directory into one labelled DataFrame.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        DataFrame with a ``text`` column holding one raw line per row
        (line endings included) and a ``label`` column holding the name of
        the file the line came from, without its extension.
    """
    texts, names = [], []

    for entry in os.listdir(directory):
        if not entry.endswith('.txt'):
            continue
        # The file name without extension doubles as the class label.
        stem, _extension = os.path.splitext(entry)
        with open(os.path.join(directory, entry), 'r', encoding='utf-8') as handle:
            file_lines = list(handle)
        texts += file_lines
        names += [stem] * len(file_lines)

    return pd.DataFrame({'text': texts, 'label': names})

# Downsize dataframe to max_rows per label
def downsize_dataframe(df, label_column, max_rows=10000, random_state=None):
    """Randomly cap the number of rows kept for each label.

    Args:
        df: Input DataFrame.
        label_column: Name of the column to group by.
        max_rows: Maximum number of rows kept per distinct label; groups
            with fewer rows are kept whole (but still shuffled).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        New DataFrame with all original columns, groups concatenated in
        sorted label order, and a fresh 0..n-1 index.
    """
    # Iterating the groups (rather than groupby.apply) always yields frames
    # that still contain the label column, independent of the pandas version.
    sampled_groups = [
        group.sample(n=min(len(group), max_rows), random_state=random_state)
        for _, group in df.groupby(label_column)
    ]
    if not sampled_groups:
        # pd.concat refuses an empty list; return an empty frame of the same shape.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups, ignore_index=True)

# Preprocess text data
def preprocess_text(text):
    """Normalise raw ``username: message`` chat lines.

    Args:
        text: pandas Series of raw chat lines.

    Returns:
        Series of the same length holding the message part only, lowercased,
        with every character that is neither a word character nor whitespace
        removed, and surrounding whitespace stripped. Lines without a ``:``
        become empty strings.
    """
    # Split on the FIRST colon only, so colons inside the message
    # (emoticons, URLs, times) do not truncate it.
    message = text.str.split(":", n=1).str[1]
    # Lines with no colon have no second part; use "" rather than NaN so
    # later str.join / vectorizer calls do not fail.
    message = message.fillna("")
    message = message.str.lower()  # Lowercase
    message = message.str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
    return message.str.strip()  # Remove leading/trailing whitespace

# Weaker preprocessing (only remove usernames, whitespace, no removal of punctuation or case) 
def weak_preprocess_text(text):
    """Strip only the username and surrounding whitespace from chat lines.

    Unlike ``preprocess_text`` this keeps case and punctuation.

    Args:
        text: pandas Series of raw ``username: message`` chat lines.

    Returns:
        Series of the same length holding the stripped message part. Lines
        without a ``:`` become empty strings.
    """
    # Split on the FIRST colon only, so colons inside the message survive.
    message = text.str.split(":", n=1).str[1]
    # No colon means no message part; use "" instead of NaN.
    message = message.fillna("")
    return message.str.strip()  # Remove leading/trailing whitespace

# t-SNE Visualization
def visualize_with_tsne(features, labels):
    """Show a 2-D t-SNE scatter plot of ``features`` coloured by ``labels``.

    Args:
        features: Feature matrix handed to ``TSNE.fit_transform``.
        labels: One label per row of ``features``; used as the hue.

    The embedding uses a fixed ``random_state`` of 42. The figure is shown
    with ``plt.show()``; nothing is returned.
    """
    embedding = TSNE(n_components=2, random_state=42).fit_transform(features)

    points = pd.DataFrame({
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'label': labels,
    })

    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=points, x='x', y='y', hue='label', alpha=0.7)
    plt.title('t-SNE Visualization')
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    plt.show()

# PCA Visualization
def visualize_with_pca(features, labels):
    """Show a 2-D PCA scatter plot of ``features`` coloured by ``labels``.

    Args:
        features: Feature matrix handed to ``PCA.fit_transform``.
        labels: One label per row of ``features``; used as the hue.

    The projection keeps the first two components and uses a fixed
    ``random_state`` of 42. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    projection = PCA(n_components=2, random_state=42).fit_transform(features)

    points = pd.DataFrame({
        'x': projection[:, 0],
        'y': projection[:, 1],
        'label': labels,
    })

    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=points, x='x', y='y', hue='label', alpha=0.7)
    plt.title('PCA Visualization')
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    plt.show()

# Jaccard Similarity
# Takes two lists of words, returns float
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of the distinct items of two lists.

    Args:
        list1: First iterable of hashable items (e.g. words).
        list2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, where A and B are the sets of
        distinct items. When both inputs are empty the ratio is undefined;
        0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    set1, set2 = set(list1), set(list2)
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return len(set1 & set2) / union_size

# Cosine Similarity
# Takes two lists of words, returns float
def cosine_sim(list1, list2):
    """Return the TF-IDF cosine similarity of two word lists.

    Each list is joined with spaces into one document; a fresh
    ``TfidfVectorizer`` (default settings) is fitted on exactly those two
    documents and the cosine similarity of the two rows is returned.

    Args:
        list1: First list of words.
        list2: Second list of words.

    Returns:
        The single similarity value taken from the 1x1 result matrix.
    """
    # TF-IDF works on raw text, so each word list becomes one document.
    documents = [" ".join(words) for words in (list1, list2)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    similarity = cosine_similarity(tfidf[0], tfidf[1])
    return similarity[0][0]

# Predict Word
# Takes a word, label mapping, model, and vectorizer, returns a prediction
def predict_word(word, model, vect, label_mapping):
    """Predict the label name for a single piece of text.

    Args:
        word: Text to classify.
        model: Fitted classifier exposing ``predict``.
        vect: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded index. The predicted
            index is used as a position into the dict's keys, so this
            assumes the values are 0..n-1 in insertion order (true for the
            mappings built with ``enumerate`` in this file).

    Returns:
        The label name at the predicted position.
    """
    vectorized = vect.transform([word])
    predicted_index = model.predict(vectorized)[0]
    label_names = list(label_mapping)
    return label_names[predicted_index]

# Predict Word with Confidence
# Takes a word, label mapping, model, and vectorizer, returns prediction and confidence
def predict_word_confidence(word, model, vect, label_mapping):
    """Predict the label name for a text together with the model's confidence.

    Args:
        word: Text to classify.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        vect: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded index; the predicted
            index is used as a position into the dict's keys (assumes the
            values are 0..n-1 in insertion order).

    Returns:
        Tuple ``(prediction, confidence)`` where ``prediction`` is the label
        name chosen by ``model.predict`` and ``confidence`` is the largest
        probability reported by ``model.predict_proba``.
    """
    # Vectorize once and reuse the result for both model calls.
    vectorized = vect.transform([word])
    prediction = list(label_mapping)[model.predict(vectorized)[0]]
    confidence = model.predict_proba(vectorized).max()
    return prediction, confidence

# Get Confidence for all Labels given Word
# Takes a word, label mapping, model, and vectorizer, returns all predictions and their confidence
def get_confidence_for_all_labels(word, model, vect, label_mapping):
    """Return the predicted probability of every label for one text.

    Args:
        word: Text to classify.
        model: Fitted classifier exposing ``predict_proba``.
        vect: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded index. The i-th key is
            paired with the i-th probability column (assumes the values are
            0..n-1 in insertion order).

    Returns:
        Dict of label name -> probability.
    """
    # Run the vectorizer and the model once, not once per label.
    probabilities = model.predict_proba(vect.transform([word]))[0]
    return {label: probabilities[i] for i, label in enumerate(label_mapping)}

# for each streamer label, select k random messages in that label and get the average confidence of all labels from predicting the k messages, 
# return a dataframe with each streamer label, the label's highest average confidence, 
# the label with the highest average confidence, and the average confidence of the correct label
# also return the accuracy (% of correct labels)
def simulate_k_predictions_for_each_label(data, k, model, vectorizer, label_mapping):
    """Classify each label from the averaged prediction over k of its messages.

    For every label in ``label_mapping``, ``k`` messages of that label are
    sampled (fixed ``random_state=42``), the model's class probabilities are
    averaged over those messages, and the label with the highest average is
    taken as the prediction for the whole sample.

    Args:
        data: DataFrame with ``text`` and ``label`` columns.
        k: Number of messages sampled per label; sampling raises
            ``ValueError`` if a label has fewer than ``k`` rows.
        model: Fitted classifier exposing ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_mapping: Dict of label name -> encoded index. The i-th key is
            paired with the i-th probability column (assumes the values are
            0..n-1 in insertion order).

    Returns:
        Tuple ``(results, accuracy)``. ``results`` is a DataFrame with one
        row per label and the columns ``label``, ``highest_avg_confidence``,
        ``highest_avg_confidence_label`` and ``correct_avg_confidence``.
        ``accuracy`` is the fraction of labels whose highest-average label
        is the label itself; it is 0.0 when ``label_mapping`` is empty.
    """
    label_names = list(label_mapping)
    results = []
    for label in label_names:
        # k random messages of this label.
        sample = data[data['label'] == label].sample(n=k, random_state=42)
        # One batched call yields a (k, n_labels) probability matrix, rather
        # than one vectorizer + model call per message and per label.
        probabilities = np.asarray(
            model.predict_proba(vectorizer.transform(sample['text']))
        )
        avg_confidences = dict(zip(label_names, probabilities.sum(axis=0) / k))
        highest_avg_confidence_label = max(avg_confidences, key=avg_confidences.get)
        results.append({
            'label': label,
            'highest_avg_confidence': avg_confidences[highest_avg_confidence_label],
            'highest_avg_confidence_label': highest_avg_confidence_label,
            'correct_avg_confidence': avg_confidences[label],
        })

    if not results:
        # No labels means nothing to score; avoid dividing by zero.
        return pd.DataFrame(results), 0.0

    correct_predictions = sum(
        1 for r in results if r['label'] == r['highest_avg_confidence_label']
    )
    return pd.DataFrame(results), correct_predictions / len(results)

# Manual download of Northernlion's chat (call left disabled)
video_id = "2362321171" # NL 1/24/25
output_file = "txts/nl.txt"
# download_chat(video_id, output_file)

# Manual download of xQc's chat (call left disabled)
video_id = "2362597129" #xQc 1/24/25
output_file = "txts/xqc.txt"
# download_chat(video_id, output_file)

# Load every chat in 'txts' and keep at most 10,000 random lines per streamer
data = downsize_dataframe(load_data_from_directory('txts'), 'label', 10000)

# Replace the raw lines with their normalised message text
data['text'] = preprocess_text(data['text'])

# Integer code for each label, numbered in order of first appearance
label_mapping = {name: code for code, name in enumerate(data['label'].unique())}
data['label_encoded'] = data['label'].map(label_mapping)

# Word cloud built from every preprocessed NL message
nl_messages = data.loc[data['label'] == 'nl', 'text']
wc = WordCloud().generate(' '.join(nl_messages))
plt.axes().set_axis_off()
plt.imshow(wc)

<matplotlib.image.AxesImage at 0x145b7bc50>

# Word cloud built from every preprocessed xQc message
xqc_messages = data.loc[data['label'] == 'xqc', 'text']
wc = WordCloud().generate(' '.join(xqc_messages))
plt.axes().set_axis_off()
plt.imshow(wc)

<matplotlib.image.AxesImage at 0x1465e0ad0>

# Flatten each streamer's messages into one list of space-separated tokens
nl_words = ' '.join(data.loc[data['label'] == 'nl', 'text']).split(' ')
xqc_words = ' '.join(data.loc[data['label'] == 'xqc', 'text']).split(' ')

# Overlap of the two chats' distinct tokens
print(f"Jaccard Similarity: {jaccard_similarity(xqc_words, nl_words)}")

Jaccard Similarity: 0.17324976348155155

# TF-IDF cosine similarity between the two chats' word lists (normal preprocessing)
print(f"Cosine Similarity: {cosine_sim(xqc_words, nl_words)}")

Cosine Similarity: 0.33939988175303515

# Word-level TF-IDF features as a dense array
# max_features chosen based on performance/speed tradeoff after testing multiple options
vectorizer = TfidfVectorizer(max_features=5000)
features = vectorizer.fit_transform(data['text']).toarray()
labels = data['label_encoded'].to_numpy()

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Fit Naive Bayes on the training split
nb_clf = MultinomialNB().fit(X_train, y_train)

# Score it on the held-out split
y_pred = nb_clf.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(y_test, y_pred, target_names=list(label_mapping)),
)

Naive Bayes Accuracy: 0.85325
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          nl       0.89      0.81      0.85      2019
         xqc       0.82      0.89      0.86      1981

    accuracy                           0.85      4000
   macro avg       0.86      0.85      0.85      4000
weighted avg       0.86      0.85      0.85      4000

# Fit Logistic Regression on the same word-level TF-IDF split
lr_clf = LogisticRegression(n_jobs=-1, random_state=42).fit(X_train, y_train)

# Score it on the held-out split
y_pred = lr_clf.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(y_test, y_pred, target_names=list(label_mapping)),
)

Logistic Regression Accuracy: 0.849
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          nl       0.88      0.81      0.84      2019
         xqc       0.82      0.89      0.85      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Fit a Random Forest on the same word-level TF-IDF split
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)

# Score it on the held-out split
y_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print(
    "Random Forest Classification Report:\n",
    classification_report(y_test, y_pred, target_names=list(label_mapping)),
)

Random Forest Accuracy: 0.84975
Random Forest Classification Report:
               precision    recall  f1-score   support

          nl       0.82      0.91      0.86      2019
         xqc       0.89      0.79      0.84      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Vectorize with word n-grams (1 to 3 words); no character analyzer is set here
n_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
n_features = n_vectorizer.fit_transform(data['text']).toarray()
n_labels = data['label_encoded'].to_numpy()

# Hold out 20% of the rows for testing (fixed seed)
n_X_train, n_X_test, n_y_train, n_y_test = train_test_split(
    n_features, n_labels, test_size=0.2, random_state=42
)

# Fit and score Naive Bayes on the n-gram features
n_nb_clf = MultinomialNB().fit(n_X_train, n_y_train)

n_y_pred = n_nb_clf.predict(n_X_test)
n_nb_accuracy = accuracy_score(n_y_test, n_y_pred)
print("Naive Bayes Accuracy:", n_nb_accuracy)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(n_y_test, n_y_pred, target_names=list(label_mapping)),
)

Naive Bayes Accuracy: 0.85325
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          nl       0.89      0.81      0.85      2019
         xqc       0.82      0.89      0.86      1981

    accuracy                           0.85      4000
   macro avg       0.86      0.85      0.85      4000
weighted avg       0.86      0.85      0.85      4000

# Fit and score Logistic Regression on the word n-gram features
n_lr_clf = LogisticRegression(n_jobs=-1, random_state=42).fit(n_X_train, n_y_train)

n_y_pred = n_lr_clf.predict(n_X_test)
n_lr_accuracy = accuracy_score(n_y_test, n_y_pred)
print("Logistic Regression Accuracy:", n_lr_accuracy)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(n_y_test, n_y_pred, target_names=list(label_mapping)),
)

Logistic Regression Accuracy: 0.85075
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          nl       0.88      0.81      0.85      2019
         xqc       0.82      0.89      0.86      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Character n-grams (1 to 5 characters, 'char_wb' analyzer) as TF-IDF features
c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
c_features = c_vectorizer.fit_transform(data['text']).toarray()
c_labels = data['label_encoded'].to_numpy()

# Hold out 20% of the rows for testing (fixed seed)
c_X_train, c_X_test, c_y_train, c_y_test = train_test_split(
    c_features, c_labels, test_size=0.2, random_state=42
)

# Fit and score Naive Bayes on the character features
c_nb_clf = MultinomialNB().fit(c_X_train, c_y_train)

c_y_pred = c_nb_clf.predict(c_X_test)
c_nb_accuracy = accuracy_score(c_y_test, c_y_pred)
print("Naive Bayes Accuracy:", c_nb_accuracy)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(c_y_test, c_y_pred, target_names=list(label_mapping)),
)

Naive Bayes Accuracy: 0.84875
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          nl       0.85      0.85      0.85      2019
         xqc       0.84      0.85      0.85      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Fit and score Logistic Regression on the character n-gram features
c_lr_clf = LogisticRegression(n_jobs=-1, random_state=42).fit(c_X_train, c_y_train)

c_y_pred = c_lr_clf.predict(c_X_test)
c_lr_accuracy = accuracy_score(c_y_test, c_y_pred)
print("Logistic Regression Accuracy:", c_lr_accuracy)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(c_y_test, c_y_pred, target_names=list(label_mapping)),
)

Logistic Regression Accuracy: 0.8625
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          nl       0.87      0.85      0.86      2019
         xqc       0.85      0.87      0.86      1981

    accuracy                           0.86      4000
   macro avg       0.86      0.86      0.86      4000
weighted avg       0.86      0.86      0.86      4000

# "lp" = less preprocessing. The chats are reloaded and resampled here, so
# lp_data is a separate random draw whose rows need not match `data`.
lp_data = downsize_dataframe(load_data_from_directory('txts'), 'label', 10000)

# Only usernames and surrounding whitespace are removed
lp_data['text'] = weak_preprocess_text(lp_data['text'])

# Integer code for each label, numbered in order of first appearance
lp_label_mapping = {name: code for code, name in enumerate(lp_data['label'].unique())}
lp_data['label_encoded'] = lp_data['label'].map(lp_label_mapping)

# Wordclouds for LP in 1x2 grid
plt.figure(figsize=(12, 6))
for panel, (streamer, panel_title) in enumerate([('nl', 'Northernlion'), ('xqc', 'xQc')], start=1):
    plt.subplot(1, 2, panel)
    plt.axis('off')
    wc = WordCloud().generate(' '.join(lp_data.loc[lp_data['label'] == streamer, 'text']))
    plt.imshow(wc)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

# Token lists for the overall chat similarity under weak preprocessing
lp_nl_words = ' '.join(lp_data.loc[lp_data['label'] == 'nl', 'text']).split(' ')
lp_xqc_words = ' '.join(lp_data.loc[lp_data['label'] == 'xqc', 'text']).split(' ')

print(f"Less Preprocessing Jaccard Similarity, NL vs xQc: {jaccard_similarity(lp_xqc_words, lp_nl_words)}")
print(f"Less Preprocessing Cosine Similarity, NL vs xQc: {cosine_sim(lp_xqc_words, lp_nl_words)}")

Less Preprocessing Jaccard Similarity, NL vs xQc: 0.13593539703903096
Less Preprocessing Cosine Similarity, NL vs xQc: 0.35153146875446284

# Similarity Between Less Preprocessed vs Normal
# Compares each streamer's weakly preprocessed word list with the normally
# preprocessed word list of the same streamer, using both metrics.
print(f"Jaccard Similarity, Less Preprocessed vs Normal, NL: {jaccard_similarity(lp_nl_words, nl_words)}")
print(f"Cosine Similarity, Less Preprocessed vs Normal, NL: {cosine_sim(lp_nl_words, nl_words)}")
print(f"Jaccard Similarity, Less Preprocessed vs Normal, xQc: {jaccard_similarity(lp_xqc_words, xqc_words)}")
print(f"Cosine Similarity, Less Preprocessed vs Normal, xQc: {cosine_sim(lp_xqc_words, xqc_words)}")

Jaccard Similarity, Less Preprocessed vs Normal, NL: 0.24566871390594922
Cosine Similarity, Less Preprocessed vs Normal, NL: 0.9861841344801694
Jaccard Similarity, Less Preprocessed vs Normal, xQc: 0.18620356970574048
Cosine Similarity, Less Preprocessed vs Normal, xQc: 0.9787615415988046

# Bar graph of NL-vs-xQc similarity under normal and weak preprocessing
word_list_pairs = [(xqc_words, nl_words), (lp_xqc_words, lp_nl_words)]
jaccard_similarities = [jaccard_similarity(first, second) for first, second in word_list_pairs]
cosine_similarities = [cosine_sim(first, second) for first, second in word_list_pairs]

bar_width = 0.4
index = np.arange(2)

# Two bars per preprocessing method, drawn side by side
rects1 = plt.bar(index, jaccard_similarities, width=bar_width, color='b', label='Jaccard')
rects2 = plt.bar(index + bar_width, cosine_similarities, width=bar_width, color='r', label='Cosine')

plt.title('Similarity Metrics from Preprocessing Methods')
plt.xticks(index + bar_width / 2, ('Normal', 'Weak (Keeps Punctuation, Caps)'))
plt.legend()

plt.tight_layout()
plt.show()

# Word-level TF-IDF on the weakly preprocessed text
lp_vectorizer = TfidfVectorizer(max_features=5000)
lp_features = lp_vectorizer.fit_transform(lp_data['text']).toarray()
lp_labels = lp_data['label_encoded'].to_numpy()

lp_X_train, lp_X_test, lp_y_train, lp_y_test = train_test_split(
    lp_features, lp_labels, test_size=0.2, random_state=42
)

# Fit and score Naive Bayes
lp_nb_clf = MultinomialNB().fit(lp_X_train, lp_y_train)

lp_y_pred = lp_nb_clf.predict(lp_X_test)
lp_nb_accuracy = accuracy_score(lp_y_test, lp_y_pred)
print("Naive Bayes Accuracy:", lp_nb_accuracy)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(lp_y_test, lp_y_pred, target_names=list(lp_label_mapping)),
)

Naive Bayes Accuracy: 0.8355
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          nl       0.87      0.79      0.83      2019
         xqc       0.81      0.88      0.84      1981

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      4000
weighted avg       0.84      0.84      0.84      4000

# Fit and score Logistic Regression on the weakly preprocessed word features
lp_lr_clf = LogisticRegression(n_jobs=-1, random_state=42).fit(lp_X_train, lp_y_train)

lp_y_pred = lp_lr_clf.predict(lp_X_test)
lp_lr_accuracy = accuracy_score(lp_y_test, lp_y_pred)
print("Logistic Regression Accuracy:", lp_lr_accuracy)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(lp_y_test, lp_y_pred, target_names=list(lp_label_mapping)),
)

Logistic Regression Accuracy: 0.83575
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          nl       0.87      0.79      0.83      2019
         xqc       0.81      0.88      0.84      1981

    accuracy                           0.84      4000
   macro avg       0.84      0.84      0.84      4000
weighted avg       0.84      0.84      0.84      4000

# Character n-grams (1 to 5 characters) on the weakly preprocessed text
lp_c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
lp_c_features = lp_c_vectorizer.fit_transform(lp_data['text']).toarray()
lp_c_labels = lp_data['label_encoded'].to_numpy()

lp_c_X_train, lp_c_X_test, lp_c_y_train, lp_c_y_test = train_test_split(
    lp_c_features, lp_c_labels, test_size=0.2, random_state=42
)

# Fit and score Naive Bayes
lp_c_nb_clf = MultinomialNB().fit(lp_c_X_train, lp_c_y_train)

lp_c_y_pred = lp_c_nb_clf.predict(lp_c_X_test)
lp_c_nb_accuracy = accuracy_score(lp_c_y_test, lp_c_y_pred)
print("Naive Bayes Accuracy:", lp_c_nb_accuracy)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(lp_c_y_test, lp_c_y_pred, target_names=list(lp_label_mapping)),
)

Naive Bayes Accuracy: 0.84575
Naive Bayes Classification Report:
               precision    recall  f1-score   support

          nl       0.84      0.86      0.85      2019
         xqc       0.85      0.84      0.84      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Fit and score Logistic Regression on the weak character n-gram features
lp_c_lr_clf = LogisticRegression(n_jobs=-1, random_state=42).fit(lp_c_X_train, lp_c_y_train)

lp_c_y_pred = lp_c_lr_clf.predict(lp_c_X_test)
lp_c_lr_accuracy = accuracy_score(lp_c_y_test, lp_c_y_pred)
print("Logistic Regression Accuracy:", lp_c_lr_accuracy)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(lp_c_y_test, lp_c_y_pred, target_names=list(lp_label_mapping)),
)

Logistic Regression Accuracy: 0.851
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          nl       0.86      0.84      0.85      2019
         xqc       0.84      0.86      0.85      1981

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000

# Unsupervised baseline: 2-cluster KMeans on lp_c_features
# (Less Processed, Character N-Grams)
kmeans = KMeans(n_clusters=2, random_state=42).fit(lp_c_features)

# Cluster id assigned to each data point
cluster_assignments = kmeans.labels_

# Give every cluster the true label that occurs most often inside it
cluster_to_label = {}
for cluster in range(2):
    cluster_indices = np.flatnonzero(cluster_assignments == cluster)  # Rows in this cluster
    cluster_labels = lp_c_labels[cluster_indices]  # Their true labels
    label_counts = Counter(cluster_labels)
    if not label_counts:
        # An empty cluster gets no label
        continue
    cluster_to_label[cluster] = max(label_counts, key=label_counts.get)

# Translate cluster ids into predicted labels
km_y_pred = np.array([cluster_to_label[assigned] for assigned in cluster_assignments])

# Score the clustering as if it were a classifier
km_accuracy = accuracy_score(lp_c_labels, km_y_pred)
print("KMeans Accuracy:", km_accuracy)
print(
    "KMeans Classification Report:\n",
    classification_report(lp_c_labels, km_y_pred, target_names=list(lp_label_mapping)),
)

# Bar graph of model accuracies across feature / preprocessing methods
nb_accuracies = [nb_accuracy, n_nb_accuracy, c_nb_accuracy, lp_nb_accuracy, lp_c_nb_accuracy]
lr_accuracies = [lr_accuracy, n_lr_accuracy, c_lr_accuracy, lp_lr_accuracy, lp_c_lr_accuracy]

# Same ten values, interleaved NB/LR per method; used for the y-axis range
accuracies = [
    nb_accuracy, lr_accuracy,
    n_nb_accuracy, n_lr_accuracy,
    c_nb_accuracy, c_lr_accuracy,
    lp_nb_accuracy, lp_lr_accuracy,
    lp_c_nb_accuracy, lp_c_lr_accuracy,
]

bar_width = 0.4
index = np.arange(len(nb_accuracies))

rects1 = plt.bar(index, nb_accuracies, width=bar_width, color='b', label='Naive Bayes')
rects2 = plt.bar(index + bar_width, lr_accuracies, width=bar_width, color='r', label='Logistic Regression')

plt.title('Model Accuracies')
plt.xticks(index + bar_width / 2, ['Standard', 'N-Grams', 'Char N-Grams', 'Weak P.P.', 'Weak P.P. c-N-Grams'], fontsize=8)
plt.legend()
# Zoom the y-axis onto the observed accuracy range
plt.ylim(min(accuracies) - 0.01, max(accuracies) + 0.01)

plt.show()

# Predict every row (train and test) with the weak-preprocessing,
# character n-gram Naive Bayes model
full_labels_pred = lp_c_nb_clf.predict(lp_c_features)

# Evaluate that Naive Bayes model on the full data set
# For sanity, to check for extreme overfitting
print("Full Data NB Accuracy:", accuracy_score(lp_c_labels, full_labels_pred))
print(
    "Full Data NB Classification Report:\n",
    classification_report(lp_c_labels, full_labels_pred, target_names=list(lp_label_mapping)),
)

Full Data NB Accuracy: 0.87315
Full Data NB Classification Report:
               precision    recall  f1-score   support

          nl       0.87      0.88      0.87     10000
         xqc       0.88      0.87      0.87     10000

    accuracy                           0.87     20000
   macro avg       0.87      0.87      0.87     20000
weighted avg       0.87      0.87      0.87     20000

# full_labels_pred was computed from lp_c_features, i.e. row-for-row from
# lp_data. `data` is a separate, unseeded random sample of the same files, so
# its rows are not aligned with these predictions; the outcome masks must be
# applied to lp_data for the selected messages to be the ones that were
# actually predicted.
correct_mask = lp_c_labels == full_labels_pred

# Get all successful predictions
successful_predictions = lp_data[correct_mask]

# Separate successful predictions by streamer
nl_successful_predictions = successful_predictions[successful_predictions['label'] == 'nl']
xqc_successful_predictions = successful_predictions[successful_predictions['label'] == 'xqc']

# Get all failed predictions
failed_predictions = lp_data[~correct_mask]

# Separate failed predictions by streamer
xqc_failed_predictions = failed_predictions[failed_predictions['label'] == 'xqc']
nl_failed_predictions = failed_predictions[failed_predictions['label'] == 'nl']

# 2x2 grid of word clouds for successful and failed predictions by streamer
fig, ax = plt.subplots(2, 2, figsize=(14, 7))
ax[0, 0].imshow(WordCloud().generate(' '.join(nl_successful_predictions['text'])))
ax[0, 0].set_title('NL Predicted as NL')
ax[0, 0].axis('off')
ax[0, 1].imshow(WordCloud().generate(' '.join(nl_failed_predictions['text'])))
ax[0, 1].set_title('NL Predicted as xQc')
ax[0, 1].axis('off')
ax[1, 0].imshow(WordCloud().generate(' '.join(xqc_successful_predictions['text'])))
ax[1, 0].set_title('xQc Predicted as xQc')
ax[1, 0].axis('off')
ax[1, 1].imshow(WordCloud().generate(' '.join(xqc_failed_predictions['text'])))
ax[1, 1].set_title('xQc Predicted as NL')
ax[1, 1].axis('off')
plt.tight_layout()
plt.show()

# Get statistics of failed predictions vs full dataset
# Message lengths in characters, taken from the same frame the model saw
all_chat_lengths = lp_data['text'].str.len()
failed_chat_lengths = failed_predictions['text'].str.len()

# Box and Whisker Plots of lengths; the right panel is the same plot with
# the y-axis limited to -2..52 to show the bulk of the distribution
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
ax[0].boxplot([all_chat_lengths, failed_chat_lengths], tick_labels=['All Chats', 'Failed Predictions'])
ax[0].set_title('Chat Character Limit')
ax[1].boxplot([all_chat_lengths, failed_chat_lengths], tick_labels=['All Chats', 'Failed Predictions'])
ax[1].set_ylim(-2, 52)
ax[1].set_title('Chat Character Limit, Zoomed In')
plt.tight_layout()
plt.show()

# # SVM Classifier
# svm = SVC(probability=True)
# svm.fit(lp_c_X_train, lp_c_y_train)
# svm_y_pred = svm.predict(lp_c_X_test)
# svm_accuracy = accuracy_score(lp_c_y_test, svm_y_pred)
# print("SVM Accuracy:", svm_accuracy)
# print("SVM Classification Report:\n", classification_report(lp_c_y_test, svm_y_pred, target_names=lp_label_mapping.keys()))

# Results table: one row per streamer pair; the metric columns start empty
cols = ["label1", "label2", "accuracy", "jaccard_sim", "cosine_sim"]
pairwise_results = pd.DataFrame(columns=cols)

# The ten streamer pairs, stored as two parallel lists
l1 = ["nl", "nl", "nl", "nl", "xqc", "xqc", "xqc", "ibai", "ibai", "tommy"]
l2 = ["xqc", "ibai", "tommy", "jack", "ibai", "tommy", "jack", "tommy", "jack", "jack"]
pairwise_results["label1"], pairwise_results["label2"] = l1, l2

# One VOD id per streamer
vods_dict = dict(
    xqc="2362597129",  # React, Streamer Life Sim 1/24/25
    nl="2362321171",  # Playing Puck, Jackbox 1/24/25
    ibai="2351403650",  # Live WC Soccer React
    tommy="2328457566",  # Joining TubboSMP
    jack="2352536292",  # Tubbo SMP (More Recently)
)

# Download Chats from vods_dict
# for key in vods_dict:
#     download_chat(vods_dict[key], f"pairs_txts/{key}.txt")

# Load the five chats and keep at most 5000 random lines per streamer
pairs_data = downsize_dataframe(load_data_from_directory('pairs_txts'), 'label', 5000)

# Weak preprocessing: only usernames and surrounding whitespace are removed
pairs_data['text'] = weak_preprocess_text(pairs_data['text'])

# Integer code for each label, numbered in order of first appearance
pairs_label_mapping = {name: code for code, name in enumerate(pairs_data['label'].unique())}
pairs_data['label_encoded'] = pairs_data['label'].map(pairs_label_mapping)

# Word clouds for all 5 streamers on a 2x3 grid (last cell stays empty)
labels = ['nl', 'xqc', 'ibai', 'tommy', 'jack']
titles = ['Northernlion', 'xQc', 'Ibai', 'TommyInnit', 'JackManifoldTV']

plt.figure(figsize=(12, 5))
for i, (label, title) in enumerate(zip(labels, titles), start=1):
    plt.subplot(2, 3, i)
    plt.axis('off')
    streamer_text = ' '.join(pairs_data.loc[pairs_data['label'] == label, 'text'])
    wc = WordCloud().generate(streamer_text)
    plt.imshow(wc)
    plt.title(title)
plt.tight_layout()
plt.show()

# Fill the jaccard_sim / cosine_sim columns for every streamer pair
jaccard_col = pairwise_results.columns.get_loc("jaccard_sim")
cosine_col = pairwise_results.columns.get_loc("cosine_sim")
pair_names = zip(pairwise_results["label1"].tolist(), pairwise_results["label2"].tolist())
for i, (l1, l2) in enumerate(pair_names):
    # Token lists of both chats in the pair
    l1_words = ' '.join(pairs_data.loc[pairs_data['label'] == l1, 'text']).split(' ')
    l2_words = ' '.join(pairs_data.loc[pairs_data['label'] == l2, 'text']).split(' ')
    pairwise_results.iloc[i, jaccard_col] = jaccard_similarity(l1_words, l2_words)
    pairwise_results.iloc[i, cosine_col] = cosine_sim(l1_words, l2_words)

pairwise_results

# Scatter plot of the two similarity metrics against each other
plt.figure(figsize=(4, 3))
plt.scatter(pairwise_results['jaccard_sim'], pairwise_results['cosine_sim'], c='r')
plt.xlabel('Jaccard Similarity')
plt.ylabel('Cosine Similarity')
plt.title('Jaccard Similarity vs Cosine Similarity')
# Annotate the plot with the correlation between the two metric columns
correlation = pairwise_results['jaccard_sim'].corr(pairwise_results['cosine_sim'])
plt.text(.025, 0.675, f'Correlation: {correlation:.2f}', fontsize=8)
plt.show()

# Character n-gram TF-IDF features for all five chats at once
pairs_c_vectorizer = TfidfVectorizer(max_features=20000, analyzer='char_wb', ngram_range=(1, 5))
pairs_c_features = pairs_c_vectorizer.fit_transform(pairs_data['text']).toarray()
pairs_c_labels = pairs_data['label_encoded'].to_numpy()

# For each pair: keep only that pair's rows, split them, fit Naive Bayes
# and record the test accuracy in the results table
accuracy_col = pairwise_results.columns.get_loc("accuracy")
pair_names = zip(pairwise_results["label1"].tolist(), pairwise_results["label2"].tolist())
for i, (l1, l2) in enumerate(pair_names):
    # Rows that belong to either streamer of the pair
    pair_mask = ((pairs_data['label'] == l1) | (pairs_data['label'] == l2)).to_numpy()
    pair_features = pairs_c_features[pair_mask]
    pair_labels = pairs_c_labels[pair_mask]

    pair_X_train, pair_X_test, pair_y_train, pair_y_test = train_test_split(
        pair_features, pair_labels, test_size=0.2, random_state=42
    )

    pair_nb_clf = MultinomialNB().fit(pair_X_train, pair_y_train)

    pair_y_pred = pair_nb_clf.predict(pair_X_test)
    pair_accuracy = accuracy_score(pair_y_test, pair_y_pred)
    pairwise_results.iloc[i, accuracy_col] = pair_accuracy

pairwise_results

# Pairwise NB accuracy plotted against each similarity measure, side by side
plt.figure(figsize=(12, 5))
similarity_panels = [('jaccard_sim', 'Jaccard Similarity'), ('cosine_sim', 'Cosine Similarity')]
for panel, (sim_column, sim_name) in enumerate(similarity_panels, start=1):
    plt.subplot(1, 2, panel)
    plt.scatter(pairwise_results[sim_column], pairwise_results['accuracy'])
    plt.xlabel(sim_name)
    plt.ylabel('Accuracy')
    plt.title(f'Accuracy vs {sim_name}')
plt.tight_layout()
plt.show()

# Make network graph based off of pairwise similarities
G = nx.Graph()

G.add_nodes_from(pairwise_results["label1"].unique())
for l1, l2, pair_cos in zip(pairwise_results["label1"],
                            pairwise_results["label2"],
                            pairwise_results["cosine_sim"]):
    # Edge weight grows exponentially with similarity (2 ** (10 * cos)); the raw
    # cosine similarity is stored alongside so the edge labels do not have to
    # invert the weight transform.
    G.add_edge(l1, l2, weight=2 ** (pair_cos * 10), cosine_sim=pair_cos)

# Fixed seed so the layout is reproducible between runs
# (the curated streamer network further down also uses seed=42)
pos = nx.spring_layout(G, weight='weight', k=10, iterations=5000, seed=42)

plt.figure(figsize=(8, 6))
plt.title('Cosine Similarity Between Streamer Chats Network Graph')
nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=1500, font_weight='bold')

# Draw edge labels (similarities)
edge_labels = {(u, v): f'{d["cosine_sim"]:.3f}' for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.show()

# VODs for the proof of concept (jack is removed to eliminate the tommy/jack overlap)
poc_vods_dict = dict(
    xqc="2362597129",  # React, Streamer Life Sim 1/24/25
    nl="2362321171",  # Playing Puck, Jackbox 1/24/25
    ibai="2351403650",  # Live WC Soccer React
    tommy="2328457566",  # Joining TubboSMP
)

# Download Chats from vods_dict (left commented out)
# for key in poc_vods_dict:
#     download_chat(poc_vods_dict[key], f"poc_txts/{key}.txt")

# Load the chats and downsize to 5000 (keyed on the 'label' column)
poc_data = downsize_dataframe(load_data_from_directory('poc_txts'), 'label', 5000)

# Light ("weak") text preprocessing
poc_data['text'] = weak_preprocess_text(poc_data['text'])

# Integer-encode the labels in order of first appearance
poc_label_mapping = {
    streamer: code
    for code, streamer in enumerate(poc_data['label'].unique())
}
poc_data['label_encoded'] = poc_data['label'].map(poc_label_mapping)

# Character n-gram TF-IDF features (1-5 characters, at most 20000 features)
poc_c_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    max_features=20000,
)
poc_c_features = poc_c_vectorizer.fit_transform(poc_data['text']).toarray()
poc_c_labels = poc_data['label_encoded'].values

# 80/20 train/test split
(poc_X_train, poc_X_test,
 poc_y_train, poc_y_test) = train_test_split(
    poc_c_features, poc_c_labels, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier
poc_nb_clf = MultinomialNB()
poc_nb_clf.fit(poc_X_train, poc_y_train)

# Evaluate on the held-out split
poc_y_pred = poc_nb_clf.predict(poc_X_test)
poc_accuracy = accuracy_score(poc_y_test, poc_y_pred)
print("NB Accuracy:", poc_accuracy)
print(
    "NB Classification Report:\n",
    classification_report(poc_y_test, poc_y_pred, target_names=poc_label_mapping.keys()),
)

NB Accuracy: 0.76825
NB Classification Report:
               precision    recall  f1-score   support

        ibai       0.88      0.88      0.88       995
          nl       0.76      0.72      0.74      1024
       tommy       0.70      0.76      0.73      1004
         xqc       0.73      0.71      0.72       977

    accuracy                           0.77      4000
   macro avg       0.77      0.77      0.77      4000
weighted avg       0.77      0.77      0.77      4000

# One hand-picked word per streamer (manually chosen from personal understanding of each chat)
likely_words = dict(
    xqc="omE",
    nl="Cereal",
    ibai="que",
    tommy="GayPride",
)

# Predicted streamer for each likely word, using the proof-of-concept model
for key, word in likely_words.items():
    predicted = predict_word(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {predicted}")

xqc (omE) -> xqc
nl (Cereal) -> nl
ibai (que) -> ibai
tommy (GayPride) -> tommy

# Predicted streamer plus confidence for each likely word (proof-of-concept model)
for key, word in likely_words.items():
    prediction = predict_word_confidence(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {prediction}")

xqc (omE) -> ('xqc', 0.985383877205551)
nl (Cereal) -> ('nl', 0.9997923678429135)
ibai (que) -> ('ibai', 0.9993385561171404)
tommy (GayPride) -> ('tommy', 0.9937931223214658)

# Confidence for every label, for each likely word (proof-of-concept model)
for key, word in likely_words.items():
    all_confidences = get_confidence_for_all_labels(word, poc_nb_clf, poc_c_vectorizer, poc_label_mapping)
    print(f"{key} ({word}) -> {all_confidences}")

xqc (omE) -> {'ibai': 0.00039032212695792027, 'nl': 0.0014938117427326183, 'tommy': 0.012731988924757824, 'xqc': 0.985383877205551}
nl (Cereal) -> {'ibai': 4.7077612713758734e-05, 'nl': 0.9997923678429135, 'tommy': 6.909077060448049e-05, 'xqc': 9.146377377103007e-05}
ibai (que) -> {'ibai': 0.9993385561171404, 'nl': 0.0002265670485358673, 'tommy': 0.00010158189175076548, 'xqc': 0.00033329494257399746}
tommy (GayPride) -> {'ibai': 0.0011773986891216652, 'nl': 0.003537630939901814, 'tommy': 0.9937931223214658, 'xqc': 0.0014918480495116312}

# Cheaper variant of the proof of concept:
# 5000 -> 1000 chat messages per streamer, 20000 -> 5000 max features

# Load and downsize to 1000 (keyed on the 'label' column)
cheap_poc_data = downsize_dataframe(load_data_from_directory('poc_txts'), 'label', 1000)

# Light ("weak") text preprocessing
cheap_poc_data['text'] = weak_preprocess_text(cheap_poc_data['text'])

# Integer-encode the labels in order of first appearance
cheap_poc_label_mapping = {
    streamer: code
    for code, streamer in enumerate(cheap_poc_data['label'].unique())
}
cheap_poc_data['label_encoded'] = cheap_poc_data['label'].map(cheap_poc_label_mapping)

# Character n-gram TF-IDF features, capped at 5000 instead of 20000
cheap_poc_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    max_features=5000,
)
cheap_poc_features = cheap_poc_vectorizer.fit_transform(cheap_poc_data['text']).toarray()
cheap_poc_labels = cheap_poc_data['label_encoded'].values

# 80/20 train/test split
(cheap_poc_X_train, cheap_poc_X_test,
 cheap_poc_y_train, cheap_poc_y_test) = train_test_split(
    cheap_poc_features, cheap_poc_labels, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier
cheap_poc_nb_clf = MultinomialNB()
cheap_poc_nb_clf.fit(cheap_poc_X_train, cheap_poc_y_train)

# Evaluate on the held-out split
cheap_poc_y_pred = cheap_poc_nb_clf.predict(cheap_poc_X_test)
cheap_poc_accuracy = accuracy_score(cheap_poc_y_test, cheap_poc_y_pred)
print("NB Accuracy:", cheap_poc_accuracy)
print(
    "NB Classification Report:\n",
    classification_report(cheap_poc_y_test, cheap_poc_y_pred, target_names=cheap_poc_label_mapping.keys()),
)

NB Accuracy: 0.69875
NB Classification Report:
               precision    recall  f1-score   support

        ibai       0.83      0.84      0.83       209
          nl       0.70      0.60      0.65       213
       tommy       0.63      0.65      0.64       194
         xqc       0.63      0.70      0.66       184

    accuracy                           0.70       800
   macro avg       0.70      0.70      0.70       800
weighted avg       0.70      0.70      0.70       800

# Predicted streamer plus confidence for each likely word (cheap model)
for key, word in likely_words.items():
    prediction = predict_word_confidence(word, cheap_poc_nb_clf, cheap_poc_vectorizer, cheap_poc_label_mapping)
    print(f"{key} ({word}) -> {prediction}")

xqc (omE) -> ('xqc', 0.895962644027219)
nl (Cereal) -> ('nl', 0.9941023736550885)
ibai (que) -> ('ibai', 0.9944629253379634)
tommy (GayPride) -> ('tommy', 0.9417083636400793)

# Confidence for every label, for each likely word (cheap model)
for key, word in likely_words.items():
    all_confidences = get_confidence_for_all_labels(word, cheap_poc_nb_clf, cheap_poc_vectorizer, cheap_poc_label_mapping)
    print(f"{key} ({word}) -> {all_confidences}")

xqc (omE) -> {'ibai': 0.009010011324352836, 'nl': 0.018529435312124486, 'tommy': 0.07649790933630353, 'xqc': 0.895962644027219}
nl (Cereal) -> {'ibai': 0.0010749494167538163, 'nl': 0.9941023736550885, 'tommy': 0.0013668750933012144, 'xqc': 0.0034558018348584347}
ibai (que) -> {'ibai': 0.9944629253379634, 'nl': 0.0022253504209979354, 'tommy': 0.0010937822202266318, 'xqc': 0.0022179420208105023}
tommy (GayPride) -> {'ibai': 0.01699206086863734, 'nl': 0.025082190675351754, 'tommy': 0.9417083636400793, 'xqc': 0.016217384815931157}

# Top 100 English-speaking streamers of January 2025, from TwitchTracker.com.
# Each name is what the (commented-out) VOD lookup below passes to get_user_id()
# and what the chat download uses as the file name t100/<name>.txt.
top100streamers = ["Caedrel", "zackrawrr", "KaiCenat", "caseoh_", "GamesDoneQuick", 
                   "HasanAbi", "BLASTPremier", "sodapoppin", "plaqueboymax", "Jynxzi", 
                   "ohnePixel", "TimTheTatman", "xQc", "loltyler1", "shroud", 
                   "PirateSoftware", "tarik", "stableronaldo", "sinatraa", "Pikabooirl", 
                   "summit1g", "easportsfc", "jasontheween", "Thebausffs", "yourragegaming", 
                   "erobb221", "Mizkif", "Lacy", "Xaryu", "Quin69", 
                   "ESLCS", "Necros", "LCK", "LEC", "vedal987", 
                   "Ludwig", "Clix", "fissure_dota_en", "forsen", "Agent00", 
                   "NiceWigg", "k3soju", "Velcuz", "Emiru", "MOONMOON", 
                   "Twitch", "Gorgc", "LIRIK", "Warframe", "Lord_Kebun", 
                   "ironmouse", "RocketLeague", "VALORANT_EMEA", "Elajjaz", "ow_esports", 
                   "playapex", "ESLCSb", "LTANorth", "Northernlion", "Valkyrae", 
                   "Rekkles", "angryginge13", "supertf", "Nmplol", "MrSavage", 
                   "Ziqoftw", "RDCgaming", "CohhCarnage", "LVNDMARK", "DonaldTrump", 
                   "VALORANT_Americas", "Fanum", "shanks_ttv", "Nadeshot", "PaymoneyWubby", 
                   "AuzioMF", "runthefutmarket", "2xRaKai", "WarThunder_Esports", "Adapt", 
                   "ExtraEmily", "mooda", "Simurgh", "HLTVorg", "Grubby", 
                   "Jerma985", "Ninja", "Unboxholics", "AussieAntics", "Silky", 
                   "BarbarousKing", "AdmiralBahroo", "chess24", "Zentreya", "PENTA", 
                   "PlayHearthstone", "Psychoghost", "CDawgVA", "Glorious_E", "lol_nemesis"]

# # Get vod ids for top 100 streamers
# top100_vod_id = {}
# for streamer in top100streamers:
#     user_id = get_user_id(streamer)
#     if user_id:
#         vod_ids = user_vod_ids(user_id)
#         # delay to avoid rate-limiting
#         time.sleep(.2)
#         if vod_ids:
#             top100_vod_id[streamer] = vod_ids[0]
#         else:
#             print(f"No vods found for {streamer}")
#     else:
#         print(f"No user found for {streamer}")

# download chats for top 100 streamers
# for streamer in top100_vod_id:
#     download_chat(top100_vod_id[streamer], f"t100/{streamer}.txt")

# Load the chats from the t100 directory
t100_data = load_data_from_directory('t100')

# Count the messages per label and report every label with fewer than 1000
print("Labels with count < 1000:")
label_counts = t100_data['label'].value_counts()
for label, count in label_counts.items():
    if count >= 1000:
        continue
    print(f"{label}: {count}")

Labels with count < 1000:
HasanAbi: 802
Rekkles: 689
summit1g: 414

# Downsize to 1000 (keyed on the 'label' column), then apply the light ("weak") preprocessing
t100_data = downsize_dataframe(t100_data, 'label', 1000)
t100_data['text'] = weak_preprocess_text(t100_data['text'])

# Integer-encode the labels in order of first appearance
t100_label_mapping = {
    streamer: code
    for code, streamer in enumerate(t100_data['label'].unique())
}
t100_data['label_encoded'] = t100_data['label'].map(t100_label_mapping)

# Character n-gram TF-IDF features (1-5 characters, at most 20000 features)
t100_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    max_features=20000,
)
t100_features = t100_vectorizer.fit_transform(t100_data['text']).toarray()
t100_labels = t100_data['label_encoded'].values

# 80/20 train/test split
(t100_X_train, t100_X_test,
 t100_y_train, t100_y_test) = train_test_split(
    t100_features, t100_labels, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier
t100_nb_clf = MultinomialNB()
t100_nb_clf.fit(t100_X_train, t100_y_train)

MultinomialNB()

MultinomialNB()

# Score the top-100 NB classifier on the held-out split
t100_y_pred = t100_nb_clf.predict(t100_X_test)
t100_accuracy = accuracy_score(t100_y_test, t100_y_pred)
print("NB Accuracy:", t100_accuracy)
print(
    "NB Classification Report:\n",
    classification_report(t100_y_test, t100_y_pred, target_names=t100_label_mapping.keys()),
)

NB Accuracy: 0.23883009227881757
NB Classification Report:
                     precision    recall  f1-score   support

             Adapt       0.24      0.06      0.09       195
     AdmiralBahroo       0.21      0.36      0.27       187
           Agent00       0.30      0.08      0.13       214
      AussieAntics       0.08      0.40      0.13       195
           AuzioMF       0.34      0.28      0.30       174
      BLASTPremier       0.38      0.25      0.30       179
     BarbarousKing       0.29      0.15      0.19       220
           CDawgVA       0.33      0.20      0.25       196
           Caedrel       0.55      0.23      0.33       212
              Clix       0.36      0.09      0.14       201
       CohhCarnage       0.34      0.52      0.42       197
             ESLCS       0.26      0.38      0.31       203
            ESLCSb       0.19      0.13      0.15       212
           Elajjaz       0.43      0.26      0.33       187
             Emiru       0.27      0.21      0.24       194
        ExtraEmily       0.32      0.26      0.29       184
             Fanum       0.33      0.17      0.23       210
    GamesDoneQuick       0.31      0.33      0.32       193
        Glorious_E       0.30      0.20      0.24       192
             Gorgc       0.31      0.16      0.21       202
            Grubby       0.10      0.43      0.16       195
           HLTVorg       0.41      0.30      0.35       212
          HasanAbi       0.44      0.37      0.40       150
          Jerma985       0.20      0.31      0.24       208
            Jynxzi       0.38      0.21      0.27       206
          KaiCenat       0.29      0.14      0.18       185
               LCK       0.16      0.34      0.22       180
               LEC       0.24      0.10      0.14       194
             LIRIK       0.19      0.13      0.15       205
          LTANorth       0.44      0.15      0.22       193
          LVNDMARK       0.15      0.25      0.19       208
              Lacy       0.20      0.10      0.13       188
        Lord_Kebun       0.21      0.23      0.22       220
            Ludwig       0.40      0.47      0.43       191
          MOONMOON       0.30      0.16      0.21       218
            Mizkif       0.17      0.11      0.13       199
          MrSavage       0.37      0.21      0.26       218
          Nadeshot       0.17      0.21      0.19       214
            Necros       0.37      0.28      0.31       206
          NiceWigg       0.34      0.20      0.26       201
             Ninja       0.19      0.19      0.19       206
            Nmplol       0.20      0.16      0.18       218
      Northernlion       0.25      0.18      0.21       219
             PENTA       0.41      0.22      0.28       199
     PaymoneyWubby       0.38      0.21      0.27       187
        Pikabooirl       0.16      0.05      0.08       203
    PirateSoftware       0.13      0.19      0.15       208
   PlayHearthstone       0.37      0.26      0.30       192
       Psychoghost       0.06      0.56      0.11       187
            Quin69       0.30      0.41      0.35       192
         RDCgaming       0.62      0.27      0.38       199
           Rekkles       0.66      0.28      0.39       160
      RocketLeague       0.33      0.34      0.34       196
             Silky       0.20      0.24      0.22       226
           Simurgh       0.21      0.24      0.22       214
        Thebausffs       0.34      0.27      0.30       179
      TimTheTatman       0.33      0.18      0.23       198
            Twitch       0.38      0.44      0.41       185
 VALORANT_Americas       0.39      0.24      0.30       202
     VALORANT_EMEA       0.20      0.25      0.22       203
          Valkyrae       0.34      0.30      0.32       185
            Velcuz       0.34      0.42      0.38       196
WarThunder_Esports       0.30      0.37      0.33       202
          Warframe       0.27      0.57      0.37       188
             Xaryu       0.19      0.21      0.20       199
          Zentreya       0.30      0.30      0.30       193
           Ziqoftw       0.48      0.15      0.23       216
      angryginge13       0.30      0.30      0.30       204
           caseoh_       0.36      0.13      0.19       211
           chess24       0.41      0.30      0.34       205
        easportsfc       0.27      0.25      0.26       203
          erobb221       0.36      0.25      0.29       224
   fissure_dota_en       0.52      0.21      0.30       211
            forsen       0.40      0.59      0.48       199
         ironmouse       0.31      0.18      0.23       184
      jasontheween       0.17      0.14      0.15       206
            k3soju       0.40      0.21      0.27       197
       lol_nemesis       0.30      0.18      0.22       199
         loltyler1       0.19      0.14      0.16       197
             mooda       0.19      0.43      0.27       204
         ohnePixel       0.15      0.18      0.16       202
        ow_esports       0.16      0.12      0.14       176
      plaqueboymax       0.47      0.16      0.24       191
          playapex       0.71      0.43      0.54       191
   runthefutmarket       0.13      0.20      0.16       192
        shanks_ttv       0.17      0.28      0.21       209
            shroud       0.26      0.08      0.12       199
          sinatraa       0.33      0.12      0.18       207
        sodapoppin       0.19      0.25      0.21       207
     stableronaldo       0.11      0.10      0.11       191
          summit1g       0.67      0.13      0.21        79
           supertf       0.28      0.14      0.19       202
             tarik       0.33      0.11      0.17       209
          vedal987       0.35      0.30      0.33       200
               xQc       0.28      0.20      0.23       188
    yourragegaming       0.18      0.19      0.18       197
         zackrawrr       0.16      0.08      0.11       207

          accuracy                           0.24     19181
         macro avg       0.30      0.24      0.24     19181
      weighted avg       0.30      0.24      0.24     19181

# Try a few words against the top-100 model.
# They were picked to be fairly representative of certain streamers, but mainly
# to give an idea of what the confidence outputs look like.
test_words = ["omE", "Cereal", "+2", "Vamos", "HYPE", "meow", "?????", "W"]
for word in test_words:
    prediction = predict_word_confidence(word, t100_nb_clf, t100_vectorizer, t100_label_mapping)
    print(f"{word} -> {prediction}")

omE -> ('Velcuz', 0.10749637964058793)
Cereal -> ('Northernlion', 0.46834438477445667)
+2 -> ('Northernlion', 0.48310941232733584)
Vamos -> ('VALORANT_Americas', 0.18433942776219692)
HYPE -> ('Twitch', 0.07199855009141166)
meow -> ('LTANorth', 0.16426181729494213)
????? -> ('shanks_ttv', 0.14745575555816642)
W -> ('mooda', 0.06407703459188754)

# Word clouds for three selected labels, one panel each
plt.figure(figsize=(12, 6))
for panel, streamer in enumerate(('Ludwig', 'GamesDoneQuick', 'ow_esports'), start=1):
    plt.subplot(1, 3, panel)
    plt.axis('off')
    # All messages of this label joined into a single document
    streamer_text = ' '.join(t100_data[t100_data['label'] == streamer]['text'])
    wc = WordCloud().generate(streamer_text)
    plt.imshow(wc)
    plt.title(streamer)
plt.tight_layout()
plt.show()

# Dictionary mapping streamer name -> VOD id, based on the Top 50 Streamers of January 2025.
# Channels / VODs were curated by hand for: 5000+ messages, under 10 hours, not controversial
# (to my understanding), relevant English-speaking personalities (no event channels) and
# active chats (personally chosen).
# Notable streamers from deeper down the top average-viewership list were added as well
# (personally chosen).
# VODs were picked to be representative of each streamer's typical content, to my understanding.
# The keys double as file names (curated/<key>.txt) in the commented-out download below.
streamer_vods_dict = {
    "KaiCenat": "2369101926",
    "caseoh_": "2374536052",
    "xQc": "2372442853",
    "HasanAbi": "2373350488",
    "Jynxzi": "2370784820",
    "TimTheTatman": "2368684088",
    "yourragegaming": "2371768096",
    "loltyler1": "2305694222",
    "Emiru": "2368082972",
    "stableronaldo": "2369922252",
    "sodapoppin": "2368630449",
    "Nmplol": "2354008082",
    "Thebausffs": "2373924882",
    "tarik": "2374297065",
    "Fanum": "2371078809",
    "Jerma985": "2364495585",
    "NiceWigg": "2370111298",
    "PaymoneyWubby": "2373603842",
    "k3soju": "2374654925",
    "Ludwig": "2369047103",
    "Clix": "2373494800",
    "RDCgaming": "2371018865",
    "MOONMOON": "2374355157",
    "LIRIK": "2370616578",
    "Northernlion": "2373219071",
    "shroud": "2371796733",
    "Pokimane": "2365252668",
    "DougDoug": "2359836833",
    "Erobb221": "2353266197",
    "summit1g": "2373391595",
    "CohhCarnage": "2365882528",
    "Gorgc": "2373983009",
    "Tenz": "2371956512",
    "Ironmouse": "2370341972",
    "Elajjaz": "2370520964",
    "Silky": "2365668281",
    "CDawgVA": "2362828964",
    "supertf": "2371930154",
    "Ninja": "2368566120",
    "Tubbo": "2371501050",
    "Rekkles": "2368432207",
    "Maximilian_DOOD": "2372857427",
    "Duke": "2334535229",
    "Vinesauce": "2371038452",
    "AdmiralBahroo": "2371570506",
    "Doublelift": "2373428024",
    "PENTA": "2370870696",
    "BotezLive": "2368806611",
    "Maya": "2352403390",
    "MoistCr1tikal": "2374424054",
    "MitchJones": "2346454726",
    "WillNeff": "2374535099",
    "Scump": "2365093422",
    "Atrioc": "2365643219",
    "Wirtual": "2366108791",
    "39Daph": "2373794511",
    "Kitboga": "2374045089",
    "Sykkuno": "2372561637",
    "RTGame": "2367895634",
    "Nymn": "2371329490",
    "QTCinderella": "2365701390",
    "EsfandTV": "2374433871",
    "DrLupo": "2373165739",
    "AvoidingThePuddle": "2373458227",
    "SmallAnt": "2374224186",
    "Cyr": "2374339519",
    "GeminiTay": "2358621826",
    "Philza": "2368729628",
    "LilyPichu": "2368909378",
    "jasontheween": "2370876605",
    "PirateSoftware": "2372088543",
    "Caedrel": "2373860550",
    "plaqueboymax": "2371057098",
    "Necros": "2370396275",
    "ohnePixel": "2372167845",
    "Valkyrae": "2373345336",
    "tommyinnit": "2328457566",
    "Squeex": "2371706296",
    "Vargskelethor": "2370922396",
    "Coney": "2370966784",
    "Emongg": "2373896418",
    "shanks_ttv": "2373851247"
}

# Print the number of entries in streamer_vods_dict
print(f" Number of Streamers: {len(streamer_vods_dict)}")

 Number of Streamers: 82

# Download Chats (left commented out)
# for key in streamer_vods_dict:
#     download_chat(streamer_vods_dict[key], f"curated/{key}.txt")

# Load the chats from the curated directory
curated_data = load_data_from_directory('curated')

# Count the messages per label and report every label with fewer than 1000
print("Streamers with < 1000 Messages:")
label_counts = curated_data['label'].value_counts()
for label, count in label_counts.items():
    if count >= 1000:
        continue
    print(f"{label}: {count}")

Streamers with < 1000 Messages:

# Downsize to 1000 (keyed on the 'label' column), then apply the light ("weak") preprocessing
curated_data = downsize_dataframe(curated_data, 'label', 1000)
curated_data['text'] = weak_preprocess_text(curated_data['text'])

# Integer-encode the labels in order of first appearance
curated_label_mapping = {
    streamer: code
    for code, streamer in enumerate(curated_data['label'].unique())
}
curated_data['label_encoded'] = curated_data['label'].map(curated_label_mapping)

# Character n-gram TF-IDF features (1-5 characters, at most 20000 features)
curated_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    max_features=20000,
)
curated_features = curated_vectorizer.fit_transform(curated_data['text']).toarray()
curated_labels = curated_data['label_encoded'].values

# 80/20 train/test split
(curated_X_train, curated_X_test,
 curated_y_train, curated_y_test) = train_test_split(
    curated_features, curated_labels, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier
curated_nb_clf = MultinomialNB()
curated_nb_clf.fit(curated_X_train, curated_y_train)

# Evaluate on the held-out split
curated_y_pred = curated_nb_clf.predict(curated_X_test)
curated_accuracy = accuracy_score(curated_y_test, curated_y_pred)
print("NB Accuracy:", curated_accuracy)

NB Accuracy: 0.22628048780487806

# Take a 10% sample of curated_data (frac=0.1, despite the "1p" in the variable name)
curated_1p = curated_data.sample(frac=0.1, random_state=42)

# Run predict_word_confidence on every sampled message and store the
# predicted label and its confidence in two new columns
curated_1p_predictions = [
    predict_word_confidence(message, curated_nb_clf, curated_vectorizer, curated_label_mapping)
    for message in curated_1p['text']
]
curated_1p['pred_label'] = [prediction[0] for prediction in curated_1p_predictions]
curated_1p['pred_conf'] = [prediction[1] for prediction in curated_1p_predictions]

# Side by side boxplots of the confidence for correct and incorrect predictions
plt.figure(figsize=(8, 6))
is_correct = curated_1p['label'] == curated_1p['pred_label']
correct_conf = curated_1p.loc[is_correct, 'pred_conf']
incorrect_conf = curated_1p.loc[~is_correct, 'pred_conf']
# The group sizes are shown in the tick labels
correct_count = len(correct_conf)
incorrect_count = len(incorrect_conf)
plt.boxplot(
    [correct_conf, incorrect_conf],
    tick_labels=[f'Correct (Count: {correct_count})', f'Incorrect (Count: {incorrect_count})'],
)
plt.title('Prediction Confidence')
plt.ylabel('Confidence')
plt.show()

# Line graph of accuracy vs confidence
plt.figure(figsize=(8, 6))

# For each confidence threshold 0.0, 0.1, ..., 0.9 compute the accuracy over the
# predictions whose confidence is at least that threshold.
# The previous version wrote `correct / total + 0.0001`: because of operator precedence
# the epsilon was added to the ratio (inflating every accuracy by 0.0001) and gave no
# protection against a zero denominator. An empty bucket now yields NaN instead.
conf_thresholds = [i / 10 for i in range(10)]
prediction_is_correct = curated_1p['label'] == curated_1p['pred_label']
nb_accuracy = []
for conf in conf_thresholds:
    at_least_conf = curated_1p['pred_conf'] >= conf
    kept_count = int(at_least_conf.sum())
    if kept_count == 0:
        # No prediction reaches this confidence level
        nb_accuracy.append(float('nan'))
        continue
    correct_kept = int((at_least_conf & prediction_is_correct).sum())
    nb_accuracy.append(correct_kept / kept_count)

# Plot line and points, label points
plt.plot(conf_thresholds, nb_accuracy, marker='o', alpha=0.5)
for conf, acc in zip(conf_thresholds, nb_accuracy):
    if np.isnan(acc):
        # Nothing to label for an empty bucket
        continue
    plt.text(conf, acc + .01, f'{acc:.2f}', fontsize=8, ha='center', va='bottom')

plt.title('Accuracy vs Confidence')
plt.xlabel('Confidence')
plt.ylabel('Accuracy')
plt.show()

# Streamer name -> VOD id, with the top-streamers list curated down to 50 streamers,
# kept for active/engaged chats or true notability.
# The keys double as file names (sm_curated_50/<key>.txt) in the commented-out download below.
smaller_streamer_vods_dict = {
    "KaiCenat": "2369101926",
    "caseoh_": "2374536052",
    "xQc": "2372442853",
    "HasanAbi": "2373350488",
    "Jynxzi": "2370784820",
    "TimTheTatman": "2368684088",
    "yourragegaming": "2371768096",
    "loltyler1": "2305694222",
    "Emiru": "2368082972",
    "stableronaldo": "2369922252",
    "sodapoppin": "2368630449",
    "Nmplol": "2354008082",
    "Thebausffs": "2373924882",
    "tarik": "2374297065",
    "Fanum": "2371078809",
    "Jerma985": "2364495585",
    "NiceWigg": "2370111298",
    "PaymoneyWubby": "2373603842",
    "k3soju": "2374654925",
    "Ludwig": "2369047103",
    "Clix": "2373494800",
    "RDCgaming": "2371018865",
    "MOONMOON": "2374355157",
    "LIRIK": "2370616578",
    "Northernlion": "2373219071",
    "shroud": "2371796733",
    "Pokimane": "2365252668",
    "DougDoug": "2359836833",
    "Erobb221": "2353266197",
    "summit1g": "2373391595",
    "CohhCarnage": "2365882528",
    "Tenz": "2371956512",
    "Ironmouse": "2370341972",
    "CDawgVA": "2362828964",
    "supertf": "2371930154",
    "Maximilian_DOOD": "2372857427",
    "Duke": "2334535229",
    "Vinesauce": "2371038452",
    "Doublelift": "2373428024",
    "WillNeff": "2374535099",
    "Atrioc": "2365643219",
    "39Daph": "2373794511",
    "Sykkuno": "2372561637",
    "Nymn": "2371329490",
    "AvoidingThePuddle": "2373458227",
    "LilyPichu": "2368909378",
    "PirateSoftware": "2372088543",
    "ohnePixel": "2372167845",
    "Valkyrae": "2373345336",
    "Squeex": "2371706296"
}

# Download Chats (left commented out)
# for key in smaller_streamer_vods_dict:
#     download_chat(smaller_streamer_vods_dict[key], f"sm_curated_50/{key}.txt")

# Load the chats from the sm_curated_50 directory
smaller_curated_data = load_data_from_directory('sm_curated_50')

# Count the messages per label and report every label with fewer than 5000
print("Streamers with < 5000 Messages:")
label_counts = smaller_curated_data['label'].value_counts()
for label, count in label_counts.items():
    if count >= 5000:
        continue
    print(f"{label}: {count}")

Streamers with < 5000 Messages:
Doublelift: 4918
LilyPichu: 3722

# Downsize to 5000 (keyed on the 'label' column), then apply the light ("weak") preprocessing
smaller_curated_data = downsize_dataframe(smaller_curated_data, 'label', 5000)
smaller_curated_data['text'] = weak_preprocess_text(smaller_curated_data['text'])

# Integer-encode the labels in order of first appearance
smaller_curated_label_mapping = {
    streamer: code
    for code, streamer in enumerate(smaller_curated_data['label'].unique())
}
smaller_curated_data['label_encoded'] = smaller_curated_data['label'].map(smaller_curated_label_mapping)

# Character n-gram TF-IDF features (1-5 characters, at most 20000 features)
smaller_curated_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    max_features=20000,
)
smaller_curated_features = smaller_curated_vectorizer.fit_transform(smaller_curated_data['text']).toarray()
smaller_curated_labels = smaller_curated_data['label_encoded'].values

# Persist the fitted vectorizer so it can be reloaded together with the model
dump(smaller_curated_vectorizer, "sc_v.joblib")

# 80/20 train/test split
(smaller_curated_X_train, smaller_curated_X_test,
 smaller_curated_y_train, smaller_curated_y_test) = train_test_split(
    smaller_curated_features, smaller_curated_labels, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier and persist it
smaller_curated_nb_clf = MultinomialNB()
smaller_curated_nb_clf.fit(smaller_curated_X_train, smaller_curated_y_train)
dump(smaller_curated_nb_clf, 'smaller_curated_nb_model.joblib')

# Evaluate on the held-out split
smaller_curated_y_pred = smaller_curated_nb_clf.predict(smaller_curated_X_test)
smaller_curated_accuracy = accuracy_score(smaller_curated_y_test, smaller_curated_y_pred)
print("NB Accuracy:", smaller_curated_accuracy)

NB Accuracy: 0.3043959137709138

# Reload the persisted NB model and its vectorizer
sc_nb = load('smaller_curated_nb_model.joblib')
sc_v = load('sc_v.joblib')

# Accuracy of the simulation for k = 1..20 sampled chat messages per streamer
k_values = range(1, 21)
accuracies = []
for k in k_values:
    results, acc = simulate_k_predictions_for_each_label(
        smaller_curated_data, k, sc_nb, sc_v, smaller_curated_label_mapping
    )
    accuracies.append(acc)

# Plot accuracy vs k TODO Plot with better models as well
plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.plot(k_values, accuracies, marker='o')
ax.set_title('Prediction Accuracy vs Number of Chat Messages Sampled per Streamer')
ax.set_xlabel('# Chats Sampled')
ax.set_ylabel('Accuracy')
ax.set_xticks(np.arange(0, 21, 2))
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.set_ylim(0, 1)
plt.show()

num_streamers = len(smaller_curated_label_mapping)

# Streamer-by-streamer matrices for cosine and Jaccard similarity.
# Only the upper triangle (j > i) is filled below; the diagonal and lower triangle stay 0.
curated_cosine_similarity = np.zeros((num_streamers, num_streamers))
curated_jaccard_similarity = np.zeros_like(curated_cosine_similarity)

# One word list per encoded label (the text is already preprocessed and downsized)
words_list = [
    ' '.join(
        smaller_curated_data.loc[smaller_curated_data['label_encoded'] == code, 'text']
    ).split(' ')
    for code in range(num_streamers)
]

# Similarities for every unordered pair i < j, visited row by row
for i, j in zip(*np.triu_indices(num_streamers, k=1)):
    curated_cosine_similarity[i, j] = cosine_sim(words_list[i], words_list[j])
    curated_jaccard_similarity[i, j] = jaccard_similarity(words_list[i], words_list[j])

# Side by side boxplots of the pairwise cosine and Jaccard similarities
# (only the strictly upper triangle of each matrix is plotted)
upper_triangle = np.triu_indices(num_streamers, k=1)
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
boxplot_panels = [
    (axs[0], curated_cosine_similarity, 'Cosine Similarity'),
    (axs[1], curated_jaccard_similarity, 'Jaccard Similarity'),
]
for panel_ax, similarity_matrix, heading in boxplot_panels:
    panel_ax.boxplot(similarity_matrix[upper_triangle])
    panel_ax.set_title(heading)
    panel_ax.set_xticks([])
# Only the left panel carries the y-axis label
axs[0].set_ylabel('Similarity')
plt.show()

# Create a graph from the cosine similarity matrix
G = nx.Graph()
# Add nodes with labels. The mapping goes streamer name -> encoded label, so
# invert it once instead of scanning its values for every node. setdefault
# keeps the first name seen for an index, matching list.index semantics.
index_to_streamer = {}
for streamer_name, encoded_label in smaller_curated_label_mapping.items():
    index_to_streamer.setdefault(encoded_label, streamer_name)
for i in range(num_streamers):
    G.add_node(i, label=index_to_streamer[i])

# The similarity matrix is only populated above the diagonal (i < j). Ranking
# row i of that matrix directly would ignore every streamer with a lower
# index, and the last streamer could never pick a neighbour at all. Mirror
# the upper triangle so each row holds the similarity to *all* other streamers.
upper_cosine = np.triu(curated_cosine_similarity, k=1)
symmetric_cosine = upper_cosine + upper_cosine.T

# Add edges, for each streamer only add the k highest cosine similarity edges if similarity > threshold
k = 1
threshold = .001
for i in range(num_streamers):
    # argsort is ascending: reverse it and keep the k most similar streamers.
    # The diagonal is 0, so a streamer is never linked to itself as long as
    # threshold >= 0.
    top_indices = np.argsort(symmetric_cosine[i])[::-1][:k]
    for j in top_indices:
        if symmetric_cosine[i, j] > threshold:
            # Plain int/float keep node ids and weights free of numpy scalars
            G.add_edge(i, int(j), weight=float(symmetric_cosine[i, j]))
# Draw the graph
plt.figure(figsize=(12, 12))
# Fixed seed so the spring layout is reproducible between runs
pos = nx.spring_layout(G, weight='weight', seed=42, k=2, iterations=20000)
labels = nx.get_node_attributes(G, 'label')

# Nodes: colored by the community assigned by community.best_partition,
# sized by degree (200 for an isolated node, +200 per incident edge)
partition = community.best_partition(G)
unique_communities = set(partition.values())
color_map = {}
for palette_slot, com in enumerate(unique_communities):
    # tab10 has 10 colors; wrap around when there are more communities
    color_map[com] = plt.cm.tab10(palette_slot % 10)
node_colors = [color_map[partition[n]] for n in G.nodes()]
node_sizes = [200 * (G.degree(n) + 1) for n in G.nodes()]
nx.draw_networkx_nodes(
    G,
    pos,
    node_size=node_sizes,
    node_color=node_colors,
    edgecolors='black',
    linewidths=1.5,
)

# Edges
nx.draw_networkx_edges(G, pos, alpha=1, width=2)

# Labels: one text box per node, placed at the node's layout position
label_box = dict(facecolor='White', edgecolor='none', boxstyle='round,pad=0.15', alpha=.8)
texts = [
    plt.text(x, y, labels[node], fontsize=10, bbox=dict(label_box))
    for node, (x, y) in pos.items()
]

# Adjust text to avoid overlap
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

plt.title('Streamer Cosine Similarity Network')
plt.axis('off')
plt.show()

	label1	label2	accuracy	jaccard_sim	cosine_sim
0	nl	xqc	NaN	0.134792	0.331596
1	nl	ibai	NaN	0.028665	0.028063
2	nl	tommy	NaN	0.122396	0.494157
3	nl	jack	NaN	0.132297	0.543763
4	xqc	ibai	NaN	0.032769	0.038524
5	xqc	tommy	NaN	0.126143	0.37855
6	xqc	jack	NaN	0.131862	0.355978
7	ibai	tommy	NaN	0.025549	0.052179
8	ibai	jack	NaN	0.026887	0.036467
9	tommy	jack	NaN	0.172311	0.695233

	label1	label2	accuracy	jaccard_sim	cosine_sim
0	nl	xqc	0.836	0.134792	0.331596
1	nl	ibai	0.9355	0.028665	0.028063
2	nl	tommy	0.8245	0.122396	0.494157
3	nl	jack	0.8345	0.132297	0.543763
4	xqc	ibai	0.9275	0.032769	0.038524
5	xqc	tommy	0.8375	0.126143	0.37855
6	xqc	jack	0.856	0.131862	0.355978
7	ibai	tommy	0.9355	0.025549	0.052179
8	ibai	jack	0.9425	0.026887	0.036467
9	tommy	jack	0.7085	0.172311	0.695233

Twitch Chat Classifier

Background

Import

Functions

Download Chat

Get Twitch OAuth

Get User ID

Get User VODs

Load txts to Dataframe

Downsize

Clean Chat Messages

Cluster Visualization

Document Similarities

Predict Word

Proof of Concept - NL vs xQc

Download Chat

Preprocess

Word Cloud

Get Overall Chat Similarity

Classification, Basic Tf-idf

Classification, n-Gram Tf-idf

Classification, Tf-idf Character-Level

Less Preprocessing Classification

Basic

Character n-grams

KMeans Clustering

Evaluate Techniques

Examine Successful, Failed Predictions

Expensive Model

More Pairwise Examinations

Multiple Streamer Proof of Concept

Standard

Cheaper Parameters, Model

Multiple Streamer Classification

Top 100 English Channels

Curated List of Top Streamers, 1000 messages per streamer

Curated Further to 50 Streamers, 5000 messages per streamer

Chat Similarity Network