import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
dataset for games: https://www.kaggle.com/datasets/nikatomashvili/steam-games-dataset
# Load the merged Steam games dataset and show its dimensions (rows, columns)
games_df = pd.read_csv(filepath_or_buffer="Dataset/games/merged_data.csv")
games_df.shape
(71700, 16)
# Turn the free-text review, price and release-date columns into separate numeric columns.
all_reviews_text = games_df['All Reviews Number']
recent_reviews_text = games_df['Recent Reviews Number']

# Percentage of positive reviews: the digits right before a '%' sign
games_df['Percentage_Positive_Reviews'] = all_reviews_text.str.extract(r'(\d+)%').astype(float)
games_df['Percentage_Positive_Reviews_Recent'] = recent_reviews_text.str.extract(r'(\d+)%').astype(float)

# Review counts: the (comma-grouped) number in front of "user reviews", with the commas stripped
all_review_counts = all_reviews_text.str.extract(r'([\d,]+) user reviews')
games_df['Total_User_Reviews'] = all_review_counts.replace(',', '', regex=True).astype(float)
recent_review_counts = recent_reviews_text.str.extract(r'([\d,]+) user reviews')
games_df['Total_User_Reviews_Recent'] = recent_review_counts.replace(',', '', regex=True).astype(float)

# Some rows carry an "<n> user reviews" text in the summary column instead
summary_text = games_df["Recent Reviews Summary"]
games_df["Total_User_Reviews_Summary"] = summary_text.str.extract(r'(\d+) user reviews?', expand=False).astype(float)

# Price: 'Free' counts as $0.00, thousands separators are removed, then the dollar amount is parsed
price_text = games_df['Original Price'].replace('Free', '$0.00').str.replace(',', '')
games_df['Price'] = price_text.str.extract(r'\$(\d+\.?\d*)').astype(float)

# Release year: first "20xx" found in the release date text
# NOTE(review): this pattern cannot match years before 2000, so such games end up with NaN here
# and are removed by the later null filter - confirm that this is intended.
release_text = games_df['Release Date']
games_df['Year_Release'] = release_text.str.extract(r'(20\d{2})', expand=False).astype(float)

games_df.head(3)
| Title | Original Price | Discounted Price | Release Date | Link | Game Description | Recent Reviews Summary | All Reviews Summary | Recent Reviews Number | All Reviews Number | ... | Popular Tags | Game Features | Minimum Requirements | Percentage_Positive_Reviews | Percentage_Positive_Reviews_Recent | Total_User_Reviews | Total_User_Reviews_Recent | Total_User_Reviews_Summary | Price | Year_Release | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Baldur's Gate 3 | $29.99 | $29.99 | 3 Aug, 2023 | https://store.steampowered.com/app/1086940/Bal... | Baldur’s Gate 3 is a story-rich, party-based R... | Overwhelmingly Positive | Very Positive | - 96% of the 128,900 user reviews in the last ... | - 94% of the 188,617 user reviews for this gam... | ... | ['RPG', 'Choices Matter', 'Character Customiza... | ['Single-player', 'Online Co-op', 'LAN Co-op',... | Requires a 64-bit processor and operating syst... | 94.0 | 96.0 | 188617.0 | 128900.0 | NaN | 29.99 | 2023.0 |
| 1 | Counter-Strike: Global Offensive | $14.99 | $14.99 | 21 Aug, 2012 | https://store.steampowered.com/app/730/Counter... | Counter-Strike: Global Offensive (CS: GO) expa... | Very Positive | Very Positive | - 89% of the 75,284 user reviews in the last 3... | - 88% of the 7,428,921 user reviews for this g... | ... | ['FPS', 'Shooter', 'Multiplayer', 'Competitive... | ['Steam Achievements', 'Full controller suppor... | OS: | Windows® 7/Vista/XP | Processor: | Int... | 88.0 | 89.0 | 7428921.0 | 75284.0 | NaN | 14.99 | 2012.0 |
| 2 | Apex Legends™ | Free | Free | 4 Nov, 2020 | https://store.steampowered.com/app/1172470/Ape... | Apex Legends is the award-winning, free-to-pla... | Mixed | Very Positive | - 65% of the 18,581 user reviews in the last 3... | - 80% of the 701,597 user reviews for this gam... | ... | ['Free to Play', 'Multiplayer', 'Battle Royale... | ['Online PvP', 'Online Co-op', 'Steam Achievem... | Requires a 64-bit processor and operating syst... | 80.0 | 65.0 | 701597.0 | 18581.0 | NaN | 0.00 | 2020.0 |
3 rows × 23 columns
# The dataset is updated until early September 2023, so 2023 is an incomplete year:
# only games released in 2022 or earlier are kept.
# Rows without an extracted release year (NaN in Year_Release) are removed as well.
games_df = games_df.dropna(subset=["Year_Release"])
mask_2022 = games_df["Year_Release"] <= 2022
# .copy() detaches the filtered rows from the original frame, so the column updates that
# follow modify games_df itself instead of a temporary slice.
games_df = games_df[mask_2022].copy()
# Percentage of missing values per column (the "All Reviews" columns are mostly empty)
games_df.isna().sum()/len(games_df)*100
Title 0.000000 Original Price 0.000000 Discounted Price 0.000000 Release Date 0.000000 Link 0.000000 Game Description 0.269018 Recent Reviews Summary 6.013675 All Reviews Summary 91.492303 Recent Reviews Number 6.013675 All Reviews Number 91.492303 Developer 0.308250 Publisher 0.706172 Supported Languages 0.000000 Popular Tags 0.000000 Game Features 0.000000 Minimum Requirements 1.944777 Percentage_Positive_Reviews 91.492303 Percentage_Positive_Reviews_Recent 36.197878 Total_User_Reviews 91.492303 Total_User_Reviews_Recent 36.197878 Total_User_Reviews_Summary 69.815797 Price 0.000000 Year_Release 0.000000 dtype: float64
# In order to minimize the amount of missing values, the main review columns are completed
# from the auxiliary ones:
#   - 'Percentage_Positive_Reviews' falls back to 'Percentage_Positive_Reviews_Recent'
#   - 'Total_User_Reviews' falls back to 'Total_User_Reviews_Recent', then 'Total_User_Reviews_Summary'
# The filled columns are assigned back explicitly: calling fillna(..., inplace=True) on a column
# selected from the frame is chained assignment and is not guaranteed to update the frame.
games_df = games_df.assign(
    Percentage_Positive_Reviews=games_df['Percentage_Positive_Reviews'].fillna(
        games_df['Percentage_Positive_Reviews_Recent']
    ),
    Total_User_Reviews=(
        games_df['Total_User_Reviews']
        .fillna(games_df['Total_User_Reviews_Recent'])
        .fillna(games_df['Total_User_Reviews_Summary'])
    ),
)
# Now we can drop the columns that were only used for assisting, so they are not confusing:
assisting_columns = ['Percentage_Positive_Reviews_Recent','Total_User_Reviews_Recent','Total_User_Reviews_Summary']
games_df = games_df.drop(columns=assisting_columns)
# check new percentage of missing values
games_df.isna().sum()/len(games_df)*100
Title 0.000000 Original Price 0.000000 Discounted Price 0.000000 Release Date 0.000000 Link 0.000000 Game Description 0.269018 Recent Reviews Summary 6.013675 All Reviews Summary 91.492303 Recent Reviews Number 6.013675 All Reviews Number 91.492303 Developer 0.308250 Publisher 0.706172 Supported Languages 0.000000 Popular Tags 0.000000 Game Features 0.000000 Minimum Requirements 1.944777 Percentage_Positive_Reviews 36.197878 Total_User_Reviews 6.013675 Price 0.000000 Year_Release 0.000000 dtype: float64
After handling the missing values, only about 6% of the values in the Total_User_Reviews column are still missing.
# The working assumption is that a game without a total review count has 0 reviews.
# However, when visiting the Link of some of these games, some are well established
# while others really do have 0 reviews.
# The main difference lies in the Steam Link: some contain /sub/ and others /app/.
empty_mask = games_df["Total_User_Reviews"].isna()
empty_df = games_df.loc[empty_mask].reset_index(drop=True)
# Split the links of those games by kind; the first 10 of each kind are printed for inspection
empty_links = empty_df["Link"]
sample_10_sub = empty_links[empty_links.str.contains("/sub/")]
sample_10_app = empty_links[empty_links.str.contains("/app/")]
print(f"Sample 10 Links with 'sub' (from total {len(sample_10_sub)})")
for link in sample_10_sub.head(10):
    print(link)
print(f"Sample 10 Links with 'app' (from total {len(sample_10_app)})")
for link in sample_10_app.head(10):
    print(link)
Sample 10 Links with 'sub' (from total 133) https://store.steampowered.com/sub/124923/?snr=1_7_7_230_150_27 https://store.steampowered.com/sub/626153/?snr=1_7_7_230_150_30 https://store.steampowered.com/sub/94174/?snr=1_7_7_230_150_32 https://store.steampowered.com/sub/692569/?snr=1_7_7_230_150_34 https://store.steampowered.com/sub/199943/?snr=1_7_7_230_150_42 https://store.steampowered.com/sub/460/?snr=1_7_7_230_150_49 https://store.steampowered.com/sub/510898/?snr=1_7_7_230_150_55 https://store.steampowered.com/sub/139397/?snr=1_7_7_230_150_65 https://store.steampowered.com/sub/15373/?snr=1_7_7_230_150_66 https://store.steampowered.com/sub/320795/?snr=1_7_7_230_150_67 Sample 10 Links with 'app' (from total 3086) https://store.steampowered.com/app/60/Ricochet/?snr=1_7_7_230_150_48 https://store.steampowered.com/app/1202100/Bodyless/?snr=1_7_7_230_150_483 https://store.steampowered.com/app/870890/Museum_of_Symmetry/?snr=1_7_7_230_150_502 https://store.steampowered.com/app/984570/Chess_Sphere/?snr=1_7_7_230_150_519 https://store.steampowered.com/app/1935960/Drifters_Tales/?snr=1_7_7_230_150_579 https://store.steampowered.com/app/510000/Xecryst_Remains/?snr=1_7_7_230_150_711 https://store.steampowered.com/app/1919150/Smudge_Coin_Run/?snr=1_7_7_230_150_713 https://store.steampowered.com/app/682290/Zeus_Battlegrounds/?snr=1_7_7_230_150_728 https://store.steampowered.com/app/1444570/Fantasy_Royal_VR/?snr=1_7_7_230_150_733 https://store.steampowered.com/app/973390/EmpiresThe_Rise/?snr=1_7_7_230_150_733
After checking the URLs, the ones with "sub" do indeed tend to belong to popular games: they include bundles, whose pages don't show the number of reviews for that game. The ones with "app", on the other hand, are games that actually have no reviews at all.
# Remove every row whose store link contains /sub/, reporting the row count before and after
rows_before = len(games_df)
sub_mask = games_df["Link"].str.contains("/sub/")
print("before removing '/sub/' urls from games_df:", rows_before)
games_df = games_df.loc[~sub_mask]
print("after removing '/sub/' urls from games_df:", len(games_df))
before removing '/sub/' urls from games_df: 53528 after removing '/sub/' urls from games_df: 53392
# For the rest of games in the dataframe, the total amount of reviews should be 0.
# DataFrame.fillna with a {column: value} mapping returns a new frame, so the fill is applied
# to games_df itself; fillna(..., inplace=True) on a selected column is chained assignment
# and is not guaranteed to write back into the frame.
games_df = games_df.fillna({"Total_User_Reviews": 0})
# Check the missing values %
games_df.isna().sum()/len(games_df)*100
Title 0.000000 Original Price 0.000000 Discounted Price 0.000000 Release Date 0.000000 Link 0.000000 Game Description 0.020602 Recent Reviews Summary 5.779892 All Reviews Summary 91.476251 Recent Reviews Number 5.779892 All Reviews Number 91.476251 Developer 0.059934 Publisher 0.548771 Supported Languages 0.000000 Popular Tags 0.000000 Game Features 0.000000 Minimum Requirements 1.700629 Percentage_Positive_Reviews 36.040980 Total_User_Reviews 0.000000 Price 0.000000 Year_Release 0.000000 dtype: float64
# Report the (rows, columns) of the dataframe at this point
current_shape = games_df.shape
print("current size of the dataframe:", current_shape)
current size of the dataframe: (53392, 20)
# reset index of games_df (a new frame is returned instead of mutating a filtered slice in place)
games_df = games_df.reset_index(drop=True)
# NOTE: this silences every warning from here on, not only the ones raised by the plots below
warnings.filterwarnings('ignore')
# There are way too many outliers on the dependent variable (games with many reviews compared
# to the vast majority, which have few reviews), so the Inter Quartile Range method is used
# to identify these outliers and remove them
# Finding the IQR
percentile25 = games_df['Total_User_Reviews'].quantile(0.25)
percentile75 = games_df['Total_User_Reviews'].quantile(0.75)
iqr = percentile75 - percentile25
# Finding the upper and lower limits
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr
# Finding outliers: values strictly beyond the limits
outliers_high = games_df[games_df['Total_User_Reviews'] > upper_limit]
outliers_low = games_df[games_df['Total_User_Reviews'] < lower_limit]
# Trimming outliers: keep everything inside [lower_limit, upper_limit]. The bounds are inclusive
# so that a value lying exactly on a limit (not an outlier by the definition above) is kept.
within_limits = games_df['Total_User_Reviews'].between(lower_limit, upper_limit)
trimmed_df = games_df[within_limits].reset_index(drop=True)
# Compare the plots before and after trimming: one row per dataset, distribution left, boxplot right
plt.figure(figsize=(16, 8))
for row, (label, frame) in enumerate([('Original', games_df), ('Trimmed', trimmed_df)]):
    plt.subplot(2, 2, 2 * row + 1)
    sns.distplot(frame['Total_User_Reviews'])
    plt.title(f'{label} Distribution')
    plt.subplot(2, 2, 2 * row + 2)
    sns.boxplot(frame['Total_User_Reviews'])
    plt.title(f'{label} Boxplot')
plt.tight_layout()
plt.show()
# Summary statistics of the target before and after trimming
for heading, size_label, frame in [
    ("Original Dataset Statistics:", "Previous dataset size", games_df),
    ("\nTrimmed Dataset Statistics:", "Current dataset size", trimmed_df),
]:
    reviews = frame['Total_User_Reviews']
    print(heading)
    print(f"{size_label}: {len(frame)}")
    print(f"mean: {reviews.mean():.2f}")
    print(f"max: {reviews.max()}")
    print(f"min: {reviews.min()}")
    print(f"std: {reviews.std():.2f}")
# Number of outliers removed (absolute count and fraction of the original rows)
num_outliers_removed = len(games_df) - len(trimmed_df)
print("\nNumber of outliers removed:", num_outliers_removed, f"({num_outliers_removed/len(games_df)})")
Original Dataset Statistics: Previous dataset size: 53392 mean: 1590.59 max: 7428921.0 min: 0.0 std: 38589.05 Trimmed Dataset Statistics: Current dataset size: 44516 mean: 36.90 max: 279.0 min: 0.0 std: 55.90 Number of outliers removed: 8876 (0.1662421336529817)
# Trimming changed the dataset statistics significantly, so the trimmed frame becomes games_df
games_df = trimmed_df
# Should the dataset be filtered so it only includes games with the "Indie" tag?
mask_indie = games_df["Popular Tags"].str.contains("Indie")
indie_games_df = games_df.loc[mask_indie]
print("only Indie tag dataframe size:", indie_games_df.shape)
# How many games would be lost by keeping only the Indie ones
non_indie_count = len(games_df) - len(indie_games_df)
print(non_indie_count, "less games")
only Indie tag dataframe size: (29564, 20) 14952 less games
# Compare the review counts of games with and without the "Indie" tag (uses mask_indie)
indie_reviews = games_df[mask_indie]["Total_User_Reviews"].dropna()
non_indie_reviews = games_df[~mask_indie]["Total_User_Reviews"].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
# One horizontal box per group: Indie at position 0, Non-Indie at position 1
for position, reviews in enumerate((indie_reviews, non_indie_reviews)):
    ax.boxplot(reviews, positions=[position], vert=False)
ax.set_yticklabels(['Indie', 'Non-Indie'])
ax.set_xlabel('Total User Reviews')
ax.set_title('Comparison of Total User Reviews (Indie vs. Non-Indie games)')
# Caption placed below the axes (coordinates are relative to the axes area)
caption = "Visually, both boxplots don't show too different of a distribution"
plt.text(0.5, -0.3, caption,
         horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
plt.tight_layout()
plt.show()
# Number of games in each group
print(len(indie_reviews))
print(len(non_indie_reviews))
29564 14952
Indie and non-indie games (based on the tag) do not seem to differ much in the distribution of their total user reviews. Hence, we choose not to create a separate category or to remove the non-indie games.
# Check whether free-to-play (F2P) games need some kind of special treatment
f2p_mask = (games_df["Original Price"].str.contains("Free"))
paid_reviews = games_df[~f2p_mask]["Total_User_Reviews"].dropna()
f2p_reviews = games_df[f2p_mask]["Total_User_Reviews"].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
# One horizontal box per group: Non-F2P at position 0, F2P at position 1
for position, reviews in enumerate((paid_reviews, f2p_reviews)):
    ax.boxplot(reviews, positions=[position], vert=False)
ax.set_yticklabels(['Non-F2P', 'F2P'])
ax.set_xlabel('Total User Reviews')
ax.set_title('Comparison of Total User Reviews (F2P vs. Non-F2P games)')
# Caption placed below the axes (coordinates are relative to the axes area)
caption = "Based on the boxplots, Non-F2P games have much less user reviews when compared to F2P games"
plt.text(0.5, -0.3, caption,
         horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
plt.tight_layout()
plt.show()
# The distributions differ a lot for F2P and non-F2P games, so a new 0/1 variable tells them apart
games_df["F2P"] = f2p_mask.astype(int)
Analysis and removal of outliers for the price of games
# IQR method will be used again to see what values we can keep
# NOTE: this silences every warning from here on, not only the ones raised by the plots below
warnings.filterwarnings('ignore')
# Price also has extreme outliers, so the Inter Quartile Range method is used
# for identifying and removing them
# Finding the IQR
percentile25 = games_df['Price'].quantile(0.25)
percentile75 = games_df['Price'].quantile(0.75)
iqr = percentile75 - percentile25
# Finding the upper and lower limits
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr
# Finding outliers: values strictly beyond the limits
outliers_high = games_df[games_df['Price'] > upper_limit]
outliers_low = games_df[games_df['Price'] < lower_limit]
# Trimming outliers: keep everything inside [lower_limit, upper_limit]. The bounds are inclusive
# so that a price lying exactly on a limit (not an outlier by the definition above) is kept.
within_limits = games_df['Price'].between(lower_limit, upper_limit)
trimmed_df = games_df[within_limits].reset_index(drop=True)
# Compare the plots before and after trimming: one row per dataset, distribution left, boxplot right
plt.figure(figsize=(16, 8))
for row, (label, frame) in enumerate([('Original', games_df), ('Trimmed', trimmed_df)]):
    plt.subplot(2, 2, 2 * row + 1)
    sns.distplot(frame['Price'])
    plt.title(f'{label} Distribution')
    plt.subplot(2, 2, 2 * row + 2)
    sns.boxplot(frame['Price'])
    plt.title(f'{label} Boxplot')
plt.tight_layout()
plt.show()
# Summary statistics of the price before and after trimming
for heading, size_label, frame in [
    ("Original Dataset Statistics:", "Previous dataset size", games_df),
    ("\nTrimmed Dataset Statistics:", "Current dataset size", trimmed_df),
]:
    prices = frame['Price']
    print(heading)
    print(f"{size_label}: {len(frame)}")
    print(f"mean: {prices.mean():.2f}")
    print(f"max: {prices.max()}")
    print(f"min: {prices.min()}")
    print(f"std: {prices.std():.2f}")
# Number of outliers removed (absolute count and fraction of the original rows)
num_outliers_removed = len(games_df) - len(trimmed_df)
print("\nNumber of outliers removed:", num_outliers_removed,f"({num_outliers_removed/len(games_df)})")
Original Dataset Statistics: Previous dataset size: 44516 mean: 952.08 max: 14046333.0 min: 0.0 std: 115306.99 Trimmed Dataset Statistics: Current dataset size: 43303 mean: 3.78 max: 13.57 min: 0.0 std: 3.02 Number of outliers removed: 1213 (0.02724862970617306)
# From here on games_df refers to the price-trimmed dataset; both names point to the
# same DataFrame object, no copy is made
games_df = trimmed_df
Analysis and transformation of Game Features into dummy variables
# Count how often each individual feature text appears across all games:
# the per-row list texts are glued together, both brackets are turned into commas and the
# result is split on commas. Empty strings produced by the split are counted as well.
brackets_to_commas = str.maketrans("[]", ",,")
game_feat = " ".join(games_df["Game Features"].astype(str)).translate(brackets_to_commas)
game_feat_list = list(map(str.strip, game_feat.split(",")))
game_feat_pd = pd.DataFrame(game_feat_list)
value_counts_gf = game_feat_pd.value_counts()
# Number of distinct entries found
print(len(value_counts_gf))
33
# Horizontal bar chart of the feature counts
plt.figure(figsize=(10, 8))
ax = value_counts_gf.plot.barh()
ax.set_title('Game Features')
ax.set_xlabel('Counts')
ax.set_ylabel('Categories')
ax.invert_yaxis()  # highest count at the top
ax.ticklabel_format(style='plain', axis='x')  # no scientific notation on the x-axis
# Write each count next to the end of its bar (the "+3" shifts the text slightly to the right)
for position, count in enumerate(value_counts_gf):
    ax.text(count + 3, position, str(count), color='black', va='center')
plt.tight_layout()
plt.show()
# Based on the value counts of "Game Features", keep every feature with a frequency higher
# than 4 and remove the empty values.
# NOTE(review): the empty entry is removed by position (first row), which assumes it is the
# most frequent entry - confirm.
frequent_enough = value_counts_gf > 4
value_counts_gf = value_counts_gf.loc[frequent_enough].iloc[1:]
value_counts_gf
'Single-player' 41738 'Steam Achievements' 20277 'Steam Cloud' 9134 'Full controller support' 8824 'Partial Controller Support' 5871 'Steam Trading Cards' 4904 'Steam Leaderboards' 4189 'Remote Play Together' 3679 'Tracked Controller Support' 3549 'VR Only' 3218 'Online PvP' 3035 'Shared/Split Screen PvP' 2537 'Stats' 2164 'Shared/Split Screen Co-op' 1901 'Online Co-op' 1678 'Cross-Platform Multiplayer' 1042 'Includes level editor' 992 'Remote Play on TV' 867 'Steam Workshop' 712 'Captions available' 669 'In-App Purchases' 596 'VR Supported' 521 'LAN PvP' 341 'MMO' 307 'LAN Co-op' 278 'Remote Play on Phone' 195 'Remote Play on Tablet' 192 'Commentary available' 115 'Steam Turn Notifications' 53 'Includes Source SDK' 15 'SteamVR Collectibles' 8 dtype: int64
# Number of feature entries left after the filtering above
value_counts_gf.shape
(31,)
# Make the row labels contiguous again (a new frame is returned instead of mutating in place)
games_df = games_df.reset_index(drop=True)
# Creating the dummy variables for game_features.
# Each index entry of value_counts_gf is unpacked with [0] and its quotes are stripped to get
# the plain feature name.
game_features_list = [el[0].replace("'","") for el in value_counts_gf.index]
for feature in game_features_list:
    # 1 when the feature name occurs in the row's "Game Features" text, 0 otherwise.
    # The test runs on the whole column at once (literal substring match, missing text -> 0)
    # instead of writing one cell at a time with .loc inside a Python loop over all rows.
    has_feature = games_df["Game Features"].str.contains(feature, regex=False, na=False)
    games_df[feature] = has_feature.astype("int64")
games_df.head(2)
| Title | Original Price | Discounted Price | Release Date | Link | Game Description | Recent Reviews Summary | All Reviews Summary | Recent Reviews Number | All Reviews Number | ... | VR Supported | LAN PvP | MMO | LAN Co-op | Remote Play on Phone | Remote Play on Tablet | Commentary available | Steam Turn Notifications | Includes Source SDK | SteamVR Collectibles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Not Tonight 2 | $10.49 | $4.72 | 11 Feb, 2022 | https://store.steampowered.com/app/1600370/Not... | Immigration Enforcement Case #112: You are und... | Very Positive | NaN | - 86% of the 235 user reviews for this game ar... | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Forest Camp Story | $5.79 | $5.79 | 27 Nov, 2022 | https://store.steampowered.com/app/1983690/For... | Create and customize your own campground in th... | Very Positive | NaN | - 90% of the 85 user reviews for this game are... | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 52 columns
Analysis and transformation of popular tags and summarization through Principal Component Analysis
# Let's see how many tags we are dealing with:
# the per-row list texts are glued together, both brackets are turned into commas and the
# result is split on commas. Empty strings produced by the split are counted as well.
brackets_to_commas = str.maketrans("[]", ",,")
all_tags = " ".join(games_df["Popular Tags"].astype(str)).translate(brackets_to_commas)
all_tags_list = list(map(str.strip, all_tags.split(",")))
all_tags_pd = pd.DataFrame(all_tags_list)
value_counts = all_tags_pd.value_counts()
# Number of distinct entries, followed by the counts themselves
print(len(value_counts))
value_counts.loc[value_counts > 0]
441
43312
'Indie' 29096
'Singleplayer' 21521
'Casual' 20822
'Action' 19572
...
'Shop Keeper' 4
'Fox' 4
'TrackIR' 4
'Reboot' 3
'Birds' 1
Length: 441, dtype: int64
# Parse each row's tag text (the string form of a list) into an actual Python list
games_df['Popular Tags'] = games_df['Popular Tags'].map(ast.literal_eval)
# Join every list with '|' and let get_dummies split on that separator: one 0/1 column per tag
tag_strings = games_df['Popular Tags'].str.join('|')
tags_dummies_df = tag_strings.str.get_dummies(sep='|')
tags_dummies_df
| 1980s | 1990's | 2.5D | 2D | 2D Fighter | 2D Platformer | 360 Video | 3D | 3D Fighter | 3D Platformer | ... | Well-Written | Werewolves | Western | Wholesome | Word Game | World War I | World War II | Wrestling | Zombies | eSports | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43298 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 43299 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 43300 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 43301 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 43302 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
43303 rows × 440 columns
Because there are too many features, we're going to use PCA for feature reduction. Source: https://mikulskibartosz.name/pca-how-to-choose-the-number-of-components
import numpy as np
from sklearn.decomposition import PCA
# Fit a PCA that keeps every component to see how the explained variance accumulates
pca = PCA()
pca.fit(tags_dummies_df)
# Running total of the explained variance ratio, component by component
cumulative_var = pca.explained_variance_ratio_.cumsum()
threshold = 0.95 # Set the threshold here
# Plot the cumulative curve together with the threshold line
component_counts = np.arange(1, cumulative_var.size + 1)
fig_var, ax_var = plt.subplots(figsize=(8, 6))
ax_var.plot(component_counts, cumulative_var, marker='o', linestyle='-', color='b')
ax_var.set_xlabel('Number of Components')
ax_var.set_ylabel('Cumulative Explained Variance Ratio')
ax_var.set_title('Cumulative Explained Variance Ratio by Number of Components')
ax_var.grid(True)
ax_var.axhline(y=threshold, color='r', linestyle='--', label=f'{threshold*100}% Threshold')
ax_var.legend()
plt.show()
# Position of the first component count whose cumulative ratio reaches the threshold (1-based)
threshold_reached = cumulative_var >= threshold
num_components = np.argmax(threshold_reached) + 1
print(f"Number of components needed to achieve {threshold*100}% threshold: {num_components}")
Number of components needed to achieve 95.0% threshold: 238
# There are way too many tag variables and the user should be able to input as many as they want,
# so Principal Component Analysis (PCA) is applied to summarize the tags so that the predictive
# model can handle them more efficiently
from sklearn.decomposition import PCA
# num_components comes from the cumulative explained variance analysis.
# random_state is fixed so the fit is reproducible: depending on the scikit-learn version,
# svd_solver='auto' may select the randomized solver for this shape / component count, and
# the components are exported for later use, so they should not change between runs.
pca = PCA(n_components=num_components, random_state=42)
pca.fit(tags_dummies_df)
# Transform the data using PCA; one column per retained component, named PCA_0, PCA_1, ...
df_pca = pd.DataFrame(pca.transform(tags_dummies_df), columns=[f'PCA_{i}' for i in range(pca.n_components_)])
df_pca.head()
| PCA_0 | PCA_1 | PCA_2 | PCA_3 | PCA_4 | PCA_5 | PCA_6 | PCA_7 | PCA_8 | PCA_9 | ... | PCA_228 | PCA_229 | PCA_230 | PCA_231 | PCA_232 | PCA_233 | PCA_234 | PCA_235 | PCA_236 | PCA_237 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.409657 | -0.324480 | 0.591374 | -0.847256 | 0.609709 | 0.003414 | -0.087016 | 0.017850 | 0.658866 | -0.022729 | ... | 0.117069 | -0.097110 | -0.206132 | 0.277560 | 0.127647 | 0.162974 | -0.130298 | 0.278489 | 0.072791 | 0.065839 |
| 1 | 0.694280 | -0.731256 | -0.804323 | -0.131967 | 0.679628 | -0.766924 | 0.381220 | 0.242563 | 0.913800 | -0.630273 | ... | 0.043569 | -0.015040 | -0.044558 | 0.030819 | -0.001442 | 0.061336 | -0.061068 | 0.025783 | 0.030147 | -0.007183 |
| 2 | 0.803688 | -0.764302 | -0.712482 | -0.142711 | 0.702741 | -0.562767 | 0.513145 | 0.434236 | 1.110108 | -0.440919 | ... | 0.050786 | -0.036438 | -0.089431 | -0.067511 | 0.011603 | -0.057506 | -0.060196 | 0.002193 | -0.014653 | 0.018034 |
| 3 | 0.929491 | -0.932884 | -0.726056 | -0.114928 | 0.522740 | -0.842807 | 0.088108 | -0.060701 | 0.774992 | -0.580296 | ... | 0.125382 | 0.052405 | -0.023821 | 0.057092 | 0.020841 | 0.081334 | -0.024063 | 0.068355 | -0.003734 | -0.024518 |
| 4 | -0.270744 | 0.983164 | 0.345343 | 0.082186 | -0.110755 | -0.321691 | 0.829532 | -0.277355 | 0.007374 | 0.067947 | ... | 0.462584 | -0.067987 | -0.153376 | 0.059805 | -0.254568 | 0.153198 | -0.199089 | 0.094935 | -0.468095 | -0.160779 |
5 rows × 238 columns
# Write the PCA component matrix to disk so it can be used on the website
pca_component_matrix = pca.components_
np.savetxt('pca_components.csv', pca_component_matrix, delimiter=',')
# Append the PCA feature columns side by side to games_df (rows are matched by index label)
games_df = pd.concat((games_df, df_pca), axis=1)
games_df.head(1)
| Title | Original Price | Discounted Price | Release Date | Link | Game Description | Recent Reviews Summary | All Reviews Summary | Recent Reviews Number | All Reviews Number | ... | PCA_228 | PCA_229 | PCA_230 | PCA_231 | PCA_232 | PCA_233 | PCA_234 | PCA_235 | PCA_236 | PCA_237 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Not Tonight 2 | $10.49 | $4.72 | 11 Feb, 2022 | https://store.steampowered.com/app/1600370/Not... | Immigration Enforcement Case #112: You are und... | Very Positive | NaN | - 86% of the 235 user reviews for this game ar... | NaN | ... | 0.117069 | -0.09711 | -0.206132 | 0.27756 | 0.127647 | 0.162974 | -0.130298 | 0.278489 | 0.072791 | 0.065839 |
1 rows × 290 columns
Remove the columns that won't be used, add new columns (supported languages), etc.
# Only a selection of the most common languages is included (English is excluded, since it is
# supported by most games).
# NOTE(review): the original note said "top 10", but the list below holds 8 languages - confirm.
languages = ["German","French","Russian","Spanish - Spain","Simplified Chinese","Japanese","Portuguese - Brazil","Korean"]
# Add one indicator column per language, initialised to 0
games_df = games_df.assign(**dict.fromkeys(languages, 0))
games_df.head(2)
| Title | Original Price | Discounted Price | Release Date | Link | Game Description | Recent Reviews Summary | All Reviews Summary | Recent Reviews Number | All Reviews Number | ... | PCA_236 | PCA_237 | German | French | Russian | Spanish - Spain | Simplified Chinese | Japanese | Portuguese - Brazil | Korean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Not Tonight 2 | $10.49 | $4.72 | 11 Feb, 2022 | https://store.steampowered.com/app/1600370/Not... | Immigration Enforcement Case #112: You are und... | Very Positive | NaN | - 86% of the 235 user reviews for this game ar... | NaN | ... | 0.072791 | 0.065839 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Forest Camp Story | $5.79 | $5.79 | 27 Nov, 2022 | https://store.steampowered.com/app/1983690/For... | Create and customize your own campground in th... | Very Positive | NaN | - 90% of the 85 user reviews for this game are... | NaN | ... | 0.030147 | -0.007183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 298 columns
# Flag every game that supports each selected language: 1 when the language name occurs in the
# row's "Supported Languages" text, 0 otherwise. The test runs on the whole column at once
# (literal substring match, missing text -> 0) instead of writing one cell at a time with .loc
# inside a Python loop over all rows.
for language in languages:
    supports_language = games_df["Supported Languages"].str.contains(language, regex=False, na=False)
    games_df[language] = supports_language.astype("int64")
# After implementing languages, the columns that won't be used by the model are removed
remove_cols = ["Title","Original Price","Discounted Price","Release Date","Link","Game Description","Recent Reviews Summary",
               "All Reviews Summary","Recent Reviews Number","All Reviews Number","Developer","Publisher","Supported Languages",
               "Popular Tags","Game Features","Minimum Requirements","Year_Release","Percentage_Positive_Reviews"]
# drop() returns a new frame, so games_df itself keeps all of its columns
clean_games_df = games_df.drop(columns=remove_cols)
# Now the rows with missing values are removed
clean_games_df = clean_games_df.dropna()
print("Shape of dataset before formatting:",games_df.shape)
print("Shape of dataset after formatting:",clean_games_df.shape)
Shape of dataset before formatting: (43303, 298) Shape of dataset after formatting: (43303, 280)
# Normalise the column names: spaces, hyphens and slashes all become underscores
separators_to_underscore = str.maketrans(" -/", "___")
clean_games_df.columns = [col.translate(separators_to_underscore) for col in clean_games_df.columns]
min_price = 0
# Games priced above min_price, used for the left-hand plot
priced_games = clean_games_df[clean_games_df["Price"] > min_price]
# Two side-by-side scatter plots, each with a LOWESS trend line drawn in black
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
left_ax, right_ax = axes
# Left: Total_User_Reviews vs. Price, restricted to Price > min_price
sns.regplot(
    ax=left_ax,
    x=priced_games["Price"],
    y=priced_games["Total_User_Reviews"],
    lowess=True,
    line_kws=dict(color='black'),
)
left_ax.set_title(f"Total_User_Reviews vs. Price > {min_price}")
# Right: Total_User_Reviews vs. Price for every game
sns.regplot(
    ax=right_ax,
    x=clean_games_df["Price"],
    y=clean_games_df["Total_User_Reviews"],
    lowess=True,
    line_kws=dict(color='black'),
)
right_ax.set_title("Total_User_Reviews vs. Price")
plt.tight_layout()
plt.show()
As the previous analysis of F2P games showed, the relationship is mostly linear, but games with price 0 (F2P) have a higher number of reviews. With this in mind, the additional F2P variable is used to mitigate this effect on the model.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
# Separate the predictors (X) from the target (y).
target_col = 'Total_User_Reviews'
X = clean_games_df.drop(columns=target_col)  # Features
y = clean_games_df[target_col]  # Target variable

# Hold out 30% of the rows for testing (70% train / 30% test), fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a Linear Regression on the training split and predict the held-out rows.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

# Residuals are shared by the MAPE and the error-spread computations below.
residuals = y_test - y_pred

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# MAPE divides by the true value, so rows whose target is 0 are masked out;
# if no row is left the metric is reported as NaN.
mask = y_test != 0
if mask.any():
    mape = np.mean(np.abs(residuals[mask] / y_test[mask])) * 100
else:
    mape = np.nan

# Displaying performance metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# RMSE is the square root of the MSE computed above.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Spread of the prediction errors.
std_error = np.std(residuals)
print(f"Standard Deviation of Errors: {std_error:.2f}")
Mean Squared Error (MSE): 2273.12 R-squared (R2): 0.2750 Mean Absolute Error (MAE): 30.98 Mean Absolute Percentage Error (MAPE): 361.34% Root Mean Squared Error (RMSE): 47.68 Standard Deviation of Errors: 47.67
# Pull the fitted parameters out of the linear model.
coefficients = linear_model.coef_
intercept = linear_model.intercept_

# Persist them as two plain-text lines in model_coefficients.txt
# (the file is overwritten on every run).
with open("model_coefficients.txt", "w") as file:
    file.writelines(
        (
            f"Coefficients: {coefficients.tolist()}\n",
            f"Intercept: {intercept}\n",
        )
    )
# Pair every fitted coefficient with the training column it belongs to.
coefficients = linear_model.coef_
feature_names = X_train.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Rank the rows by coefficient magnitude (largest first) and keep the first ten.
# NOTE(review): no feature scaling is visible before the fit, so magnitudes
# depend on each column's scale -- confirm before reading them as importance.
ranked_index = coef_df['Coefficient'].abs().sort_values(ascending=False).index
top_10_coefficients = coef_df.loc[ranked_index].head(10)

print("Top 10 Most Influential Coefficients:")
print(top_10_coefficients)
Top 10 Most Influential Coefficients:
Feature Coefficient
32 SteamVR_Collectibles 51.097263
7 Steam_Trading_Cards 39.937248
1 F2P 26.723267
28 Remote_Play_on_Tablet 22.446627
27 Remote_Play_on_Phone -18.116847
198 PCA_165 17.884798
22 In_App_Purchases 15.187831
264 PCA_231 -14.612801
25 MMO 14.597627
200 PCA_167 13.926585
# Step 1: ask which principal component to inspect and fetch its loadings.
component_number = int(input("Query PCA component number:"))
n_components = len(pca.components_)
# A negative number would silently wrap around to a component counted from
# the end, so reject anything outside 0..n_components-1 explicitly.
if not 0 <= component_number < n_components:
    raise IndexError(
        f"PCA component {component_number} is out of range "
        f"(valid values: 0 to {n_components - 1})"
    )
component_loadings = pca.components_[component_number]

# Step 2: rank the loadings from the most positive to the most negative.
# The ranking uses the signed values (no absolute value is taken), which is
# what the "Positively Associated" heading printed below describes. The name
# `absolute_loadings` is kept only so existing references keep working.
absolute_loadings = component_loadings
sorted_indices = absolute_loadings.argsort()[::-1]

# Map loading positions back to column names.
# NOTE(review): this presumes `pca` was fitted on tags_dummies_df with the
# same column order -- confirm against the cell that fits the PCA.
feature_names = list(tags_dummies_df.columns)
related_categories = [feature_names[i] for i in sorted_indices]

# Number of top categories to display.
top_n = 5

print(f"Top {top_n} Positively Associated Categories for PCA Component {component_number}:")
# Slicing (rather than range(top_n)) keeps this safe when top_n is larger
# than the number of available features.
for feature_index in sorted_indices[:top_n]:
    category = feature_names[feature_index]
    loading_value = component_loadings[feature_index]
    print(f"{category}: {loading_value:.4f}")
Query PCA component number:165 Top 5 Positively Associated Categories for PCA Component 165: Classic: 0.5705 Collectathon: 0.1762 Destruction: 0.1672 Modern: 0.1377 Detective: 0.1299
Based on the regression results, and using all the variables in the dataset (assuming no multicollinearity and a linear relationship between the independent variables and the target), the model is able to explain about 27% of the variance of the dependent variable (from the R-squared computed on the test set).
Provide additional insights to the users related to the average percentage of positive reviews
# Same exclusion list as for the regression frame, except that
# Percentage_Positive_Reviews is kept here.
remove_cols = [
    "Title",
    "Original Price",
    "Discounted Price",
    "Release Date",
    "Link",
    "Game Description",
    "Recent Reviews Summary",
    "All Reviews Summary",
    "Recent Reviews Number",
    "All Reviews Number",
    "Developer",
    "Publisher",
    "Supported Languages",
    "Popular Tags",
    "Game Features",
    "Minimum Requirements",
    "Year_Release",
]
# drop() without inplace returns a new frame, so games_df itself is untouched.
reviews_df = games_df.drop(columns=remove_cols)
# Now we're going to remove rows with missing values
reviews_df.dropna(inplace=True)
print("Shape of dataset before formatting:",games_df.shape)
print("Shape of dataset after formatting:",reviews_df.shape)
Shape of dataset before formatting: (43303, 298) Shape of dataset after formatting: (24357, 281)
# Take the positive-review percentage out of reviews_df and keep it as its
# own Series; pop() returns the column and removes it from the frame.
perc_reviews = reviews_df.pop("Percentage_Positive_Reviews")
print("Minimum amount of reviews for the game to have a %:", reviews_df["Total_User_Reviews"].min())
Minimum amount of reviews for the game to have a %: 10.0
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Elbow method: fit K-Means for K = 1..9 and record the inertia (sum of
# squared distances of samples to their closest centroid) for each K.
# NOTE(review): reviews_df is passed as-is; no feature scaling is visible
# here, so large-valued columns may dominate the distances -- confirm.
k_values = range(1, 10)
sse = []
for k in k_values:
    # random_state makes the curve reproducible between runs; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(reviews_df)
    sse.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_values, sse, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.grid(True)
plt.show()
# Lift the column display limit so wide frames are rendered in full.
pd.set_option('display.max_columns', None)
# Preview the first rows, restricted to the first 280 columns.
reviews_df.head().iloc[:, :280]
| Total_User_Reviews | Price | F2P | Single-player | Steam Achievements | Steam Cloud | Full controller support | Partial Controller Support | Steam Trading Cards | Steam Leaderboards | Remote Play Together | Tracked Controller Support | VR Only | Online PvP | Shared/Split Screen PvP | Stats | Shared/Split Screen Co-op | Online Co-op | Cross-Platform Multiplayer | Includes level editor | Remote Play on TV | Steam Workshop | Captions available | In-App Purchases | VR Supported | LAN PvP | MMO | LAN Co-op | Remote Play on Phone | Remote Play on Tablet | Commentary available | Steam Turn Notifications | Includes Source SDK | SteamVR Collectibles | PCA_0 | PCA_1 | PCA_2 | PCA_3 | PCA_4 | PCA_5 | PCA_6 | PCA_7 | PCA_8 | PCA_9 | PCA_10 | PCA_11 | PCA_12 | PCA_13 | PCA_14 | PCA_15 | PCA_16 | PCA_17 | PCA_18 | PCA_19 | PCA_20 | PCA_21 | PCA_22 | PCA_23 | PCA_24 | PCA_25 | PCA_26 | PCA_27 | PCA_28 | PCA_29 | PCA_30 | PCA_31 | PCA_32 | PCA_33 | PCA_34 | PCA_35 | PCA_36 | PCA_37 | PCA_38 | PCA_39 | PCA_40 | PCA_41 | PCA_42 | PCA_43 | PCA_44 | PCA_45 | PCA_46 | PCA_47 | PCA_48 | PCA_49 | PCA_50 | PCA_51 | PCA_52 | PCA_53 | PCA_54 | PCA_55 | PCA_56 | PCA_57 | PCA_58 | PCA_59 | PCA_60 | PCA_61 | PCA_62 | PCA_63 | PCA_64 | PCA_65 | PCA_66 | PCA_67 | PCA_68 | PCA_69 | PCA_70 | PCA_71 | PCA_72 | PCA_73 | PCA_74 | PCA_75 | PCA_76 | PCA_77 | PCA_78 | PCA_79 | PCA_80 | PCA_81 | PCA_82 | PCA_83 | PCA_84 | PCA_85 | PCA_86 | PCA_87 | PCA_88 | PCA_89 | PCA_90 | PCA_91 | PCA_92 | PCA_93 | PCA_94 | PCA_95 | PCA_96 | PCA_97 | PCA_98 | PCA_99 | PCA_100 | PCA_101 | PCA_102 | PCA_103 | PCA_104 | PCA_105 | PCA_106 | PCA_107 | PCA_108 | PCA_109 | PCA_110 | PCA_111 | PCA_112 | PCA_113 | PCA_114 | PCA_115 | PCA_116 | PCA_117 | PCA_118 | PCA_119 | PCA_120 | PCA_121 | PCA_122 | PCA_123 | PCA_124 | PCA_125 | PCA_126 | PCA_127 | PCA_128 | PCA_129 | PCA_130 | PCA_131 | PCA_132 | PCA_133 | PCA_134 | PCA_135 | PCA_136 | PCA_137 | PCA_138 | PCA_139 | PCA_140 | PCA_141 | PCA_142 | PCA_143 | PCA_144 | 
PCA_145 | PCA_146 | PCA_147 | PCA_148 | PCA_149 | PCA_150 | PCA_151 | PCA_152 | PCA_153 | PCA_154 | PCA_155 | PCA_156 | PCA_157 | PCA_158 | PCA_159 | PCA_160 | PCA_161 | PCA_162 | PCA_163 | PCA_164 | PCA_165 | PCA_166 | PCA_167 | PCA_168 | PCA_169 | PCA_170 | PCA_171 | PCA_172 | PCA_173 | PCA_174 | PCA_175 | PCA_176 | PCA_177 | PCA_178 | PCA_179 | PCA_180 | PCA_181 | PCA_182 | PCA_183 | PCA_184 | PCA_185 | PCA_186 | PCA_187 | PCA_188 | PCA_189 | PCA_190 | PCA_191 | PCA_192 | PCA_193 | PCA_194 | PCA_195 | PCA_196 | PCA_197 | PCA_198 | PCA_199 | PCA_200 | PCA_201 | PCA_202 | PCA_203 | PCA_204 | PCA_205 | PCA_206 | PCA_207 | PCA_208 | PCA_209 | PCA_210 | PCA_211 | PCA_212 | PCA_213 | PCA_214 | PCA_215 | PCA_216 | PCA_217 | PCA_218 | PCA_219 | PCA_220 | PCA_221 | PCA_222 | PCA_223 | PCA_224 | PCA_225 | PCA_226 | PCA_227 | PCA_228 | PCA_229 | PCA_230 | PCA_231 | PCA_232 | PCA_233 | PCA_234 | PCA_235 | PCA_236 | PCA_237 | German | French | Russian | Spanish - Spain | Simplified Chinese | Japanese | Portuguese - Brazil | Korean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235.0 | 10.49 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.409657 | -0.324480 | 0.591374 | -0.847256 | 0.609709 | 0.003414 | -0.087016 | 0.017850 | 0.658866 | -0.022729 | 0.422741 | -0.736449 | 0.024179 | 0.332598 | -0.155874 | -0.082455 | 0.403533 | 0.258488 | -0.617579 | -0.035172 | -0.242397 | -0.504659 | -0.758944 | 0.377408 | 0.256094 | 0.754386 | 0.141912 | 0.172213 | -0.530914 | -0.375347 | -0.317421 | 0.063041 | -0.157411 | -0.033311 | 0.488900 | -0.153821 | 0.013936 | -0.088348 | -0.100366 | 0.199055 | -0.040522 | 0.397522 | -0.021696 | -0.343000 | 0.140135 | 0.006366 | -0.072245 | -0.102702 | -0.030086 | -0.275548 | 0.358312 | 0.082344 | 0.063584 | -0.016916 | 0.228669 | 0.180594 | 0.040635 | -0.180290 | 0.038063 | -0.207103 | -0.554190 | 0.576845 | 0.204225 | 0.203906 | -0.014385 | 0.240292 | 0.263264 | -0.097004 | -0.000072 | 0.090535 | -0.051816 | -0.000427 | -0.206293 | -0.322158 | 0.047666 | 0.067689 | -0.284267 | 0.152211 | -0.073938 | 0.049885 | 0.135372 | -0.031453 | -0.148448 | 0.353353 | 0.083490 | -0.234919 | 0.170601 | 0.211671 | 0.124677 | 0.215157 | 0.096982 | 0.001342 | -0.375367 | -0.294239 | 0.329806 | 0.611630 | 0.403976 | 0.866488 | 0.016706 | -0.319775 | 0.049421 | 0.193033 | 0.073478 | -0.335907 | 0.145451 | 0.187441 | -0.107615 | -0.152358 | -0.059214 | -0.122056 | -0.036222 | 0.059762 | -0.046586 | 0.078459 | -0.088756 | -0.071529 | 0.233146 | 0.217579 | 0.193820 | 0.172144 | -0.216722 | -0.261950 | 0.041870 | -0.066101 | 0.160970 | -0.161009 | -0.093230 | 0.009910 | 0.267075 | 0.025416 | 0.389662 | -0.270292 | -0.269544 | -0.132789 | 0.198340 | -0.156007 | -0.299557 | 0.256770 | -0.257219 | 0.011908 | -0.522128 | -0.000429 | -0.232961 | 0.160617 | 0.170392 | -0.092830 | -0.146597 | 0.227432 | -0.078216 | 0.137583 | -0.140770 | -0.193893 | 0.063626 | 0.120977 | -0.062667 | 0.013888 | -0.193499 | -0.024368 | -0.103566 | 0.104873 | 0.097802 | 
-0.303384 | 0.144505 | 0.220896 | -0.017473 | 0.165641 | -0.263065 | -0.206114 | 0.444111 | -0.432401 | -0.354680 | -0.293027 | 0.354977 | 0.074068 | 0.326064 | 0.150394 | 0.265315 | -0.353465 | 0.328251 | 0.315179 | 0.508689 | -0.196024 | 0.211125 | -0.411297 | -0.165329 | 0.435495 | 0.211316 | -0.069842 | -0.024485 | -0.164164 | 0.208819 | -0.229696 | 0.109706 | 0.212416 | -0.021532 | 0.175783 | -0.057621 | 0.319950 | 0.247179 | 0.383641 | 0.315966 | -0.248953 | -0.239320 | 0.048829 | 0.062792 | -0.067516 | -0.144538 | -0.022905 | 0.014889 | -0.057549 | 0.044598 | -0.234102 | -0.093890 | -0.264132 | -0.004309 | -0.112905 | -0.133020 | 0.066963 | 0.043303 | 0.038266 | -0.225912 | 0.275565 | 0.242087 | -0.153624 | -0.041829 | 0.113284 | -0.155247 | 0.297862 | 0.117069 | -0.097110 | -0.206132 | 0.277560 | 0.127647 | 0.162974 | -0.130298 | 0.278489 | 0.072791 | 0.065839 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 |
| 1 | 85.0 | 5.79 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.694280 | -0.731256 | -0.804323 | -0.131967 | 0.679628 | -0.766924 | 0.381220 | 0.242563 | 0.913800 | -0.630273 | 0.216925 | -0.047639 | 0.139831 | -0.049149 | 0.196698 | -0.567477 | -0.272681 | 0.573347 | -0.051737 | -0.585852 | -0.204863 | -0.114170 | 0.537252 | -0.007251 | -0.028970 | 0.068364 | -0.201951 | -0.078865 | -0.151912 | -0.116429 | -0.304233 | 0.278998 | 0.419693 | -0.056275 | 0.017360 | -0.077931 | 0.126280 | 0.287068 | 0.532625 | 0.106428 | -0.000206 | 0.008367 | -0.279171 | 0.119689 | -0.021284 | -0.190269 | -0.129869 | 0.139435 | 0.205165 | -0.036536 | 0.156708 | 0.337531 | -0.001929 | -0.262832 | 0.055594 | -0.065271 | 0.001902 | 0.154905 | 0.072123 | -0.157411 | -0.138491 | -0.066429 | 0.096890 | 0.042244 | -0.085258 | 0.214388 | -0.072919 | -0.277029 | -0.060948 | -0.204603 | 0.135083 | 0.079547 | 0.077536 | 0.226930 | -0.060401 | 0.009168 | -0.015340 | -0.071728 | 0.111302 | -0.103752 | 0.017916 | 0.071730 | 0.041618 | -0.131824 | -0.020318 | 0.093251 | 0.013369 | -0.143661 | 0.191193 | -0.043753 | 0.343006 | 0.194677 | -0.131906 | 0.106923 | 0.063810 | 0.513206 | -0.116143 | -0.530287 | 0.292702 | 0.259705 | -0.032076 | 0.024206 | -0.198518 | -0.240365 | 0.295485 | 0.127039 | 0.033787 | 0.218871 | -0.048416 | 0.083146 | -0.031827 | -0.013676 | 0.137829 | 0.138090 | 0.233207 | 0.079077 | -0.000805 | 0.022617 | -0.131520 | 0.142971 | -0.018555 | 0.193114 | -0.031982 | 0.092506 | -0.181376 | -0.174573 | -0.081409 | 0.041785 | 0.015361 | 0.193292 | 0.310046 | -0.075045 | -0.249411 | 0.229453 | -0.102497 | -0.259892 | -0.058988 | -0.092342 | -0.326359 | -0.263733 | 0.168374 | -0.151889 | 0.095384 | -0.307994 | -0.052884 | -0.044226 | 0.035100 | -0.018696 | -0.032355 | -0.065669 | -0.137731 | -0.004996 | 0.019167 | 0.077318 | -0.012581 | 0.080013 | -0.271913 | 0.274094 | -0.029200 | 0.222342 | 0.007607 
| -0.431789 | 0.252465 | 0.414445 | -0.158658 | -0.018193 | -0.629965 | -0.184557 | 0.058902 | 0.142436 | 0.004293 | -0.035554 | -0.023122 | 0.043914 | -0.015136 | 0.058832 | -0.060033 | -0.046239 | 0.059668 | 0.037756 | -0.104149 | -0.017250 | 0.089835 | -0.090574 | 0.013425 | 0.023991 | 0.095297 | -0.035234 | -0.116351 | 0.008288 | -0.025400 | 0.027516 | 0.020887 | 0.106122 | 0.060141 | 0.203445 | -0.324235 | -0.205319 | -0.206892 | 0.173966 | -0.152762 | -0.258100 | 0.195660 | 0.054311 | -0.084749 | -0.149885 | 0.271648 | 0.231872 | -0.141589 | -0.147036 | -0.002746 | 0.170092 | 0.011895 | -0.074583 | 0.051157 | -0.029075 | -0.023004 | 0.070420 | 0.165893 | 0.045131 | 0.179958 | 0.119888 | 0.116236 | 0.025882 | -0.040671 | 0.092886 | -0.093597 | 0.047767 | 0.043569 | -0.015040 | -0.044558 | 0.030819 | -0.001442 | 0.061336 | -0.061068 | 0.025783 | 0.030147 | -0.007183 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
| 2 | 225.0 | 5.79 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.803688 | -0.764302 | -0.712482 | -0.142711 | 0.702741 | -0.562767 | 0.513145 | 0.434236 | 1.110108 | -0.440919 | 0.416820 | 0.375817 | 0.131907 | -0.247215 | 0.344559 | -0.291005 | -0.165455 | 0.716817 | -0.282792 | -0.336283 | -0.066810 | 0.058990 | 0.224290 | -0.224325 | 0.000569 | 0.005608 | -0.239348 | 0.112613 | -0.257442 | -0.299611 | -0.479394 | 0.412209 | 0.114827 | 0.018720 | -0.037424 | 0.092506 | 0.045463 | 0.147886 | 0.418028 | -0.122592 | -0.118099 | 0.164522 | -0.220075 | 0.082377 | 0.155993 | -0.110947 | 0.074377 | 0.048933 | 0.146834 | -0.026115 | -0.032453 | 0.474082 | 0.364122 | 0.407795 | -0.177835 | 0.311574 | 0.583603 | 0.245423 | 0.136955 | -0.394393 | -0.089323 | -0.086022 | -0.436033 | 0.245131 | -0.159452 | -0.161680 | 0.139590 | -0.291059 | -0.135516 | 0.339074 | 0.092451 | -0.029702 | -0.034609 | 0.332388 | -0.078524 | -0.018802 | -0.144803 | -0.051464 | 0.037284 | 0.231912 | 0.219138 | -0.171241 | 0.210746 | -0.400567 | 0.485344 | -0.331857 | -0.006506 | 0.140256 | 0.141687 | 0.204173 | 0.388371 | 0.058433 | 0.051556 | -0.157573 | 0.356370 | 0.186920 | 0.037566 | 0.431225 | 0.196193 | 0.075336 | 0.622562 | 0.150917 | 0.049859 | -0.309979 | 0.174709 | 0.182294 | -0.337023 | -0.329030 | -0.058266 | 0.312668 | 0.171151 | -0.238385 | -0.277395 | 0.154660 | 0.607164 | -0.176054 | -0.029586 | 0.330116 | -0.625818 | 0.595745 | -0.224860 | 0.183875 | 0.047668 | 0.258280 | -0.589725 | -0.330339 | -0.076033 | 0.212671 | 0.049748 | -0.225850 | -0.246723 | -0.106169 | 0.296239 | -0.081555 | -0.316288 | -0.176459 | -0.030233 | -0.013972 | -0.189801 | 0.204183 | 0.196081 | -0.133140 | -0.098637 | 0.184809 | -0.158221 | -0.105178 | -0.010265 | 0.143827 | -0.111082 | 0.010023 | -0.070022 | -0.271469 | -0.048957 | 0.208974 | -0.022513 | 0.057774 | -0.098277 | 0.019785 | -0.054795 | -0.041007 | 0.030711 | 
-0.165435 | 0.047178 | 0.208223 | -0.075813 | -0.189821 | -0.161922 | -0.224340 | -0.215266 | 0.168054 | 0.021854 | 0.152516 | -0.134594 | -0.042823 | 0.096735 | 0.069236 | -0.258705 | -0.039021 | 0.028002 | -0.085916 | -0.105588 | 0.108640 | 0.099645 | -0.069780 | 0.024125 | 0.039725 | 0.083296 | -0.066083 | -0.130674 | -0.080949 | 0.102462 | -0.101462 | 0.031650 | 0.094067 | 0.087863 | 0.222102 | -0.181195 | -0.120464 | -0.253469 | 0.138050 | -0.057596 | -0.113854 | 0.309566 | 0.040317 | -0.080729 | -0.149938 | 0.221023 | 0.285281 | -0.159688 | -0.130253 | -0.076187 | -0.183568 | -0.352553 | -0.041988 | 0.292411 | -0.159592 | 0.169861 | 0.163765 | 0.438876 | -0.107830 | -0.200781 | 0.079348 | -0.180177 | -0.031166 | 0.134480 | -0.116840 | -0.178634 | -0.049329 | 0.050786 | -0.036438 | -0.089431 | -0.067511 | 0.011603 | -0.057506 | -0.060196 | 0.002193 | -0.014653 | 0.018034 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 3 | 16.0 | 5.79 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.929491 | -0.932884 | -0.726056 | -0.114928 | 0.522740 | -0.842807 | 0.088108 | -0.060701 | 0.774992 | -0.580296 | 0.279064 | -0.105791 | 0.090236 | -0.062781 | -0.042130 | -0.646590 | 0.286500 | 0.677719 | -0.147090 | -0.607709 | -0.155483 | -0.242816 | 0.540126 | -0.168444 | 0.004474 | 0.120198 | -0.226751 | -0.051746 | -0.326657 | -0.151226 | -0.290903 | 0.380676 | 0.479114 | -0.127506 | 0.109911 | -0.112562 | 0.113159 | 0.254298 | 0.505504 | 0.095957 | 0.006108 | 0.051951 | -0.279050 | 0.089623 | -0.136662 | -0.127206 | -0.204727 | 0.130743 | 0.196108 | -0.118974 | 0.154051 | 0.366491 | 0.139337 | -0.264714 | -0.002097 | 0.026756 | -0.092753 | 0.165461 | 0.122922 | -0.114801 | -0.255742 | -0.007287 | 0.040428 | 0.111094 | -0.127278 | 0.203404 | -0.121484 | -0.360422 | 0.007067 | -0.168739 | 0.177675 | 0.144146 | 0.089702 | 0.159066 | -0.018784 | 0.006026 | -0.024198 | -0.097841 | 0.115738 | -0.132244 | 0.030508 | 0.075870 | 0.038705 | -0.103404 | -0.010762 | 0.074832 | 0.017567 | -0.143105 | 0.204047 | -0.037974 | 0.382128 | 0.219198 | -0.133037 | 0.108723 | 0.076086 | 0.550112 | -0.114863 | -0.555480 | 0.314994 | 0.283080 | -0.061591 | 0.049609 | -0.198549 | -0.269926 | 0.294977 | 0.122101 | 0.074594 | 0.209514 | -0.074885 | 0.100454 | -0.039714 | -0.049927 | 0.133002 | 0.145055 | 0.225195 | 0.081718 | 0.017266 | 0.027202 | -0.115488 | 0.114858 | -0.019739 | 0.207222 | -0.057635 | 0.103169 | -0.168163 | -0.174054 | -0.082340 | 0.058241 | 0.031017 | 0.202666 | 0.294572 | -0.071566 | -0.247586 | 0.215816 | -0.110945 | -0.247787 | -0.062410 | -0.073789 | -0.337171 | -0.244705 | 0.124334 | -0.174044 | 0.090301 | -0.304759 | -0.057585 | -0.039488 | 0.036029 | -0.010787 | -0.046020 | -0.039824 | -0.136864 | 0.006366 | 0.015453 | 0.105975 | -0.019245 | 0.073369 | -0.301700 | 0.308045 | -0.016559 | 0.224531 | 0.004264 | 
-0.492750 | 0.278378 | 0.474025 | -0.195136 | -0.025745 | -0.686718 | -0.174993 | 0.080452 | 0.159062 | 0.006103 | -0.051750 | -0.026966 | 0.052650 | -0.040076 | 0.079832 | -0.052376 | -0.055806 | 0.058457 | 0.065146 | -0.096804 | -0.023083 | 0.081917 | -0.106719 | 0.014766 | 0.046211 | 0.085984 | -0.018033 | -0.112645 | 0.020637 | -0.024384 | 0.050251 | 0.006728 | 0.105667 | 0.076595 | 0.171997 | -0.317151 | -0.216691 | -0.235013 | 0.175122 | -0.161660 | -0.247641 | 0.209097 | 0.070422 | -0.047798 | -0.132193 | 0.318713 | 0.262168 | -0.142461 | -0.108562 | -0.055686 | 0.253999 | 0.077172 | -0.033976 | 0.153785 | 0.070240 | -0.017721 | 0.142925 | 0.156166 | 0.217282 | 0.735767 | 0.239146 | 0.161499 | 0.041057 | -0.108523 | 0.162034 | -0.113066 | 0.063340 | 0.125382 | 0.052405 | -0.023821 | 0.057092 | 0.020841 | 0.081334 | -0.024063 | 0.068355 | -0.003734 | -0.024518 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
| 4 | 136.0 | 10.49 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -0.270744 | 0.983164 | 0.345343 | 0.082186 | -0.110755 | -0.321691 | 0.829532 | -0.277355 | 0.007374 | 0.067947 | -0.415692 | 0.200043 | 0.014283 | -0.210471 | -0.731811 | 0.662589 | -0.022502 | -0.070575 | 0.245419 | -0.547920 | 0.798233 | -0.367135 | -0.051795 | -0.228293 | 0.048638 | -0.170178 | -0.147165 | 0.559245 | 0.290505 | 0.012682 | 0.367193 | 0.412463 | -0.002623 | 0.451457 | 0.296433 | -0.390191 | 0.236318 | 0.124057 | -0.037063 | -0.383591 | 0.556176 | -0.070121 | -0.199525 | -0.115132 | -0.090444 | -0.061408 | 0.668433 | -0.248972 | 0.267453 | 0.541314 | 0.177020 | -0.150705 | 0.087225 | -0.259116 | -0.066941 | -0.091808 | 0.027158 | 0.368062 | 0.151258 | 0.233684 | 0.174814 | -0.048585 | 0.257596 | 0.059809 | -0.178357 | -0.016349 | 0.270663 | -0.263091 | 0.235821 | 0.148781 | 0.207593 | -0.270791 | 0.377389 | -0.086384 | -0.098895 | 0.379893 | -0.272484 | 0.110565 | -0.139379 | 0.184985 | 0.145482 | -0.033085 | -0.031522 | -0.155692 | -0.117188 | -0.136447 | 0.131880 | -0.100473 | -0.140170 | -0.192353 | 0.029345 | -0.085022 | 0.101250 | 0.038953 | -0.021798 | 0.240675 | 0.057040 | 0.118590 | -0.299021 | -0.018166 | 0.233100 | 0.416643 | 0.422783 | 0.081388 | -0.255402 | -0.320800 | 0.071673 | -0.229848 | -0.244175 | -0.216232 | -0.185579 | -0.059756 | -0.010926 | 0.164452 | -0.105990 | 0.187536 | -0.095631 | -0.232195 | 0.277726 | 0.242024 | 0.415438 | -0.346141 | 0.224576 | 0.185666 | -0.279517 | 0.336853 | 0.500855 | -0.200641 | 0.178764 | 0.408842 | 0.080450 | 0.206032 | 0.143937 | -0.198519 | 0.356483 | 0.008578 | 0.286385 | -0.150328 | 0.081217 | 0.162339 | -0.048368 | 0.035324 | 0.160576 | 0.013535 | -0.021063 | -0.092352 | -0.082244 | 0.098891 | 0.098428 | -0.004358 | 0.114533 | 0.211289 | 0.295634 | -0.084526 | -0.053110 | 0.164938 | 0.090579 | 0.052545 | -0.059209 | 0.113390 | -0.142641 | 
-0.128680 | -0.025106 | 0.285039 | 0.025739 | -0.012384 | 0.061502 | -0.237029 | 0.381035 | 0.451849 | -0.029450 | -0.108372 | -0.033305 | -0.139967 | 0.275758 | -0.598785 | 0.074246 | -0.077904 | -0.074210 | 0.290168 | -0.211647 | -0.131071 | 0.545578 | -0.009969 | 0.030380 | -0.177150 | -0.243583 | -0.093152 | 0.404141 | 0.136681 | 0.046191 | 0.099006 | -0.129073 | -0.067438 | 0.050372 | 0.061452 | 0.016641 | 0.029710 | -0.014867 | 0.011403 | 0.078500 | -0.102588 | 0.085155 | 0.014001 | 0.076067 | 0.236509 | 0.002467 | 0.012148 | 0.193899 | -0.052725 | -0.180092 | 0.135548 | -0.062892 | -0.151938 | 0.240784 | 0.190276 | 0.049228 | 0.192550 | -0.107496 | 0.005394 | -0.178423 | -0.281391 | 0.134556 | 0.067239 | -0.605912 | 0.052148 | -0.041095 | 0.188161 | 0.462584 | -0.067987 | -0.153376 | 0.059805 | -0.254568 | 0.153198 | -0.199089 | 0.094935 | -0.468095 | -0.160779 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# based on the elbow method, we're going to define the number of clusters as K = 4
# random_state makes the cluster ids reproducible between runs; n_init is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit on the feature columns only. On a first run this is exactly reviews_df;
# if the cell is re-run, the 'Cluster' column added below is excluded so the
# previous labels are not fed back in as a feature.
cluster_features = reviews_df.drop(columns="Cluster", errors="ignore")
kmeans.fit(cluster_features)
reviews_df['Cluster'] = kmeans.labels_

# kmeans.labels_ is positional (one label per row of reviews_df), whereas
# perc_reviews and reviews_df["Total_User_Reviews"] are Series that still
# carry the gapped index left by dropna(). Giving clusters_reviews that same
# index makes the column assignments below line up row by row; starting from
# an empty frame would create a fresh 0..n-1 index and pandas would align the
# Series to it by label, producing NaNs and mismatched rows.
clusters_reviews = pd.DataFrame({"Cluster": kmeans.labels_}, index=reviews_df.index)
clusters_reviews["Percentage_Positive_Reviews"] = perc_reviews
clusters_reviews["Total_User_Reviews"] = reviews_df["Total_User_Reviews"]
clusters_reviews.head()
| Cluster | Percentage_Positive_Reviews | Total_User_Reviews | |
|---|---|---|---|
| 0 | 1 | 86.0 | 235.0 |
| 1 | 3 | 90.0 | 85.0 |
| 2 | 1 | 85.0 | 225.0 |
| 3 | 0 | 81.0 | 16.0 |
| 4 | 2 | 84.0 | 136.0 |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# One standalone histogram of Percentage_Positive_Reviews per cluster; each
# figure is written to Website/images/ before being shown.
pastel_colors = sns.color_palette("pastel")

for cluster_id in range(4):  # Assuming there are 4 clusters
    in_cluster = clusters_reviews['Cluster'] == cluster_id
    cluster_data = clusters_reviews[in_cluster]

    # Histogram with a KDE overlay, pastel fill and no bar outlines.
    hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
    sns.histplot(
        cluster_data["Percentage_Positive_Reviews"],
        kde=True,
        color=pastel_colors[cluster_id],
        edgecolor='none',
        ax=hist_ax,
    )
    hist_ax.set_title(f'Cluster {cluster_id} - Histogram of Percentage Positive Reviews')
    hist_ax.set_xlabel('Percentage Positive Reviews')
    hist_ax.set_ylabel('Frequency')

    # Save first, then display, then release the figure.
    hist_fig.savefig(f'Website/images/cluster_{cluster_id}_histogram.png')
    plt.show()
    plt.close(hist_fig)
# Same per-cluster histograms, laid out together on a 2x2 grid.
pastel_colors = sns.color_palette("pastel")
fig, axes = plt.subplots(2,2, figsize=(14, 10))
# Flattened so each cluster id maps onto one panel.
axes_flat = axes.flatten()

for cluster_id, panel in zip(range(4), axes_flat):  # Assuming there are 4 clusters
    cluster_data = clusters_reviews[clusters_reviews['Cluster'] == cluster_id]
    # Histogram with a KDE overlay, pastel fill and no bar outlines.
    sns.histplot(
        cluster_data["Percentage_Positive_Reviews"],
        kde=True,
        color=pastel_colors[cluster_id],
        edgecolor='none',
        ax=panel,
    )
    panel.set_title(f'Cluster {cluster_id} - Histogram of Percentage Positive Reviews')
    panel.set_xlabel('Percentage Positive Reviews')
    panel.set_ylabel('Frequency')

# Tidy the spacing between panels, then render.
plt.tight_layout()
plt.show()
import pandas as pd
# Median, mean and standard deviation of Percentage_Positive_Reviews for each
# cluster. Named aggregation sets the output column names directly, and
# reset_index() turns the cluster id back into a regular 'Cluster' column.
cluster_stats = (
    clusters_reviews
    .groupby('Cluster')['Percentage_Positive_Reviews']
    .agg(Median='median', Mean='mean', Std='std')
    .reset_index()
)
# Display the cluster statistics
cluster_stats
| Cluster | Median | Mean | Std | |
|---|---|---|---|---|
| 0 | 0 | 75.0 | 72.449457 | 19.681472 |
| 1 | 1 | 84.0 | 80.881622 | 14.790937 |
| 2 | 2 | 83.0 | 79.108353 | 15.834017 |
| 3 | 3 | 80.0 | 76.671369 | 16.911836 |
# export kmeans to javascript
import json
# Cluster centres as plain nested lists so they can be serialised to JSON.
centroids = kmeans.cluster_centers_.tolist()
# Write them to Website/models/centroids.json (overwritten on every run).
with open('Website/models/centroids.json', 'w') as f:
    f.write(json.dumps(centroids))
cluster_stats
| Cluster | Median | Mean | Std | |
|---|---|---|---|---|
| 0 | 0 | 75.0 | 72.449457 | 19.681472 |
| 1 | 1 | 84.0 | 80.881622 | 14.790937 |
| 2 | 2 | 83.0 | 79.108353 | 15.834017 |
| 3 | 3 | 80.0 | 76.671369 | 16.911836 |
# Quantiles of Total_User_Reviews to show on the website.
# Each cut point is listed once: the original list contained 0.95 twice,
# which only produced a duplicated row in the resulting table.
# NOTE(review): the repeated 0.95 may have been meant as another cut point
# (e.g. 0.90 or 0.99) -- confirm with the website requirements.
quantiles = [0, 0.5, 0.75, 0.85, 0.95, 1]  # Adjust as needed
quantile_values = clean_games_df['Total_User_Reviews'].quantile(quantiles)
# Labels for the quantiles.
# NOTE(review): these four quartile labels do not correspond to the cut
# points above and are not used in this cell -- confirm how they are meant
# to be applied before relying on them.
quantile_labels = ['Bottom 25%', '25%-50%', '50%-75%', 'Top 25%']
pd.DataFrame(quantile_values)