import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Load the merged games dataset from the local CSV export
games_df = pd.read_csv("Dataset/games/merged_data.csv")


# Quick look at the dataframe dimensions (rows, columns)
games_df.shape

(71700, 16)


# First we're going to set all reviews numbers, percentage and price in separate columns
# Positive-review percentage: the digits that immediately precede a '%' sign
games_df['Percentage_Positive_Reviews'] = games_df['All Reviews Number'].str.extract(r'(\d+)%').astype(float)
games_df['Percentage_Positive_Reviews_Recent'] = games_df['Recent Reviews Number'].str.extract(r'(\d+)%').astype(float)

# Review counts: the number in front of "user reviews", with thousands separators stripped
games_df['Total_User_Reviews'] = games_df['All Reviews Number'].str.extract(r'([\d,]+) user reviews').replace(',', '', regex=True).astype(float)
games_df['Total_User_Reviews_Recent'] = games_df['Recent Reviews Number'].str.extract(r'([\d,]+) user reviews').replace(',', '', regex=True).astype(float)
# NOTE(review): this pattern captures digits only (no commas), so "1,234 user reviews" would be read as 234
# — presumably the summary column only carries small counts; verify against the data
games_df["Total_User_Reviews_Summary"] = games_df["Recent Reviews Summary"].str.extract(r'(\d+) user reviews?', expand=False).astype(float)

# Price: whole-cell 'Free' is mapped to '$0.00', commas are removed, then the amount after '$' is parsed
# NOTE(review): Series.replace without regex only swaps cells that are exactly 'Free'; a value such as
# "Free to Play" would stay unchanged and end up as NaN — confirm no such values exist
games_df['Price'] = games_df['Original Price'].replace('Free', '$0.00').str.replace(',', '').str.extract(r'\$(\d+\.?\d*)').astype(float)
# Release year: first 4-digit number starting with "20"
# NOTE(review): release dates before 2000 do not match this pattern and therefore become NaN
games_df['Year_Release'] = games_df['Release Date'].str.extract(r'(20\d{2})', expand=False).astype(float)
games_df.head(3)


# The dataset is updated until early September 2023, so only games with a release year of 2022 or earlier are kept
# NOTE(review): this comment used to say 2021 while the filter below is `<= 2022` — confirm the intended cutoff
# Null values in Year_Release will also be removed
games_df.dropna(subset=["Year_Release"],inplace=True)
mask_2022 = games_df["Year_Release"]<=2022
games_df = games_df[mask_2022]


# From the dataframe, there's roughly 91.5% missing values on the "all reviews" columns
# (percentage of missing values per column)
games_df.isna().sum()/len(games_df)*100

Title                                  0.000000
Original Price                         0.000000
Discounted Price                       0.000000
Release Date                           0.000000
Link                                   0.000000
Game Description                       0.269018
Recent Reviews Summary                 6.013675
All Reviews Summary                   91.492303
Recent Reviews Number                  6.013675
All Reviews Number                    91.492303
Developer                              0.308250
Publisher                              0.706172
Supported Languages                    0.000000
Popular Tags                           0.000000
Game Features                          0.000000
Minimum Requirements                   1.944777
Percentage_Positive_Reviews           91.492303
Percentage_Positive_Reviews_Recent    36.197878
Total_User_Reviews                    91.492303
Total_User_Reviews_Recent             36.197878
Total_User_Reviews_Summary            69.815797
Price                                  0.000000
Year_Release                           0.000000
dtype: float64


# In order to minimize the amount of missing values, the "all reviews" columns are
# back-filled from the "recent reviews" columns.
#
# The filled Series is assigned back to the column instead of calling
# `games_df[col].fillna(..., inplace=True)`: that form mutates a temporary Series and,
# under pandas Copy-on-Write, no longer updates games_df at all.

# games_df is a filtered view of the original frame at this point; work on an explicit copy
games_df = games_df.copy()

# Fill missing values in 'Percentage_Positive_Reviews' with values from 'Percentage_Positive_Reviews_Recent'
games_df['Percentage_Positive_Reviews'] = games_df['Percentage_Positive_Reviews'].fillna(
    games_df['Percentage_Positive_Reviews_Recent']
)

# Fill missing values in 'Total_User_Reviews' using 'Total_User_Reviews_Recent', and then 'Total_User_Reviews_Summary'
games_df['Total_User_Reviews'] = (
    games_df['Total_User_Reviews']
    .fillna(games_df['Total_User_Reviews_Recent'])
    .fillna(games_df['Total_User_Reviews_Summary'])
)

# Now we can drop the columns that we used for assisting so it's not confusing:
assisting_columns = ['Percentage_Positive_Reviews_Recent','Total_User_Reviews_Recent','Total_User_Reviews_Summary']
games_df = games_df.drop(columns=assisting_columns)


# check new percentage of missing values
games_df.isna().sum()/len(games_df)*100

Title                           0.000000
Original Price                  0.000000
Discounted Price                0.000000
Release Date                    0.000000
Link                            0.000000
Game Description                0.269018
Recent Reviews Summary          6.013675
All Reviews Summary            91.492303
Recent Reviews Number           6.013675
All Reviews Number             91.492303
Developer                       0.308250
Publisher                       0.706172
Supported Languages             0.000000
Popular Tags                    0.000000
Game Features                   0.000000
Minimum Requirements            1.944777
Percentage_Positive_Reviews    36.197878
Total_User_Reviews              6.013675
Price                           0.000000
Year_Release                    0.000000
dtype: float64


# The plan is to treat a missing review count as "0 reviews".
# Visiting some of the store links shows a mix, though: some entries are well established
# while others really have no reviews. The visible difference is the link type (/sub/ vs /app/).
empty_mask = games_df["Total_User_Reviews"].isna()
empty_df = games_df[empty_mask].reset_index(drop=True)

# Print the first 10 URLs of each link type so they can be checked by hand
links_without_reviews = empty_df["Link"]
sample_10_sub = links_without_reviews[links_without_reviews.str.contains("/sub/")]
sample_10_app = links_without_reviews[links_without_reviews.str.contains("/app/")]
for link_kind, links in (("sub", sample_10_sub), ("app", sample_10_app)):
    print(f"Sample 10 Links with '{link_kind}' (from total {len(links)})")
    for url in links.iloc[:10]:
        print(url)

Sample 10 Links with 'sub' (from total 133)
https://store.steampowered.com/sub/124923/?snr=1_7_7_230_150_27
https://store.steampowered.com/sub/626153/?snr=1_7_7_230_150_30
https://store.steampowered.com/sub/94174/?snr=1_7_7_230_150_32
https://store.steampowered.com/sub/692569/?snr=1_7_7_230_150_34
https://store.steampowered.com/sub/199943/?snr=1_7_7_230_150_42
https://store.steampowered.com/sub/460/?snr=1_7_7_230_150_49
https://store.steampowered.com/sub/510898/?snr=1_7_7_230_150_55
https://store.steampowered.com/sub/139397/?snr=1_7_7_230_150_65
https://store.steampowered.com/sub/15373/?snr=1_7_7_230_150_66
https://store.steampowered.com/sub/320795/?snr=1_7_7_230_150_67
Sample 10 Links with 'app' (from total 3086)
https://store.steampowered.com/app/60/Ricochet/?snr=1_7_7_230_150_48
https://store.steampowered.com/app/1202100/Bodyless/?snr=1_7_7_230_150_483
https://store.steampowered.com/app/870890/Museum_of_Symmetry/?snr=1_7_7_230_150_502
https://store.steampowered.com/app/984570/Chess_Sphere/?snr=1_7_7_230_150_519
https://store.steampowered.com/app/1935960/Drifters_Tales/?snr=1_7_7_230_150_579
https://store.steampowered.com/app/510000/Xecryst_Remains/?snr=1_7_7_230_150_711
https://store.steampowered.com/app/1919150/Smudge_Coin_Run/?snr=1_7_7_230_150_713
https://store.steampowered.com/app/682290/Zeus_Battlegrounds/?snr=1_7_7_230_150_728
https://store.steampowered.com/app/1444570/Fantasy_Royal_VR/?snr=1_7_7_230_150_733
https://store.steampowered.com/app/973390/EmpiresThe_Rise/?snr=1_7_7_230_150_733


# Drop every row whose store link is a '/sub/' URL, reporting the row count before and after
sub_mask = games_df["Link"].str.contains("/sub/")
print(f"before removing '/sub/' urls from games_df: {len(games_df)}")
games_df = games_df.loc[~sub_mask]
print(f"after removing '/sub/' urls from games_df: {len(games_df)}")

before removing '/sub/' urls from games_df: 53528
after removing '/sub/' urls from games_df: 53392


# For the rest of games in the dataframe, the total amount of reviews should be 0.
# The filled column is written back through `assign` rather than
# `games_df[col].fillna(0, inplace=True)`: the in-place call acts on a temporary Series
# and does not update games_df under pandas Copy-on-Write.
games_df = games_df.assign(Total_User_Reviews=games_df["Total_User_Reviews"].fillna(0))
# Check the missing values %
games_df.isna().sum()/len(games_df)*100

Title                           0.000000
Original Price                  0.000000
Discounted Price                0.000000
Release Date                    0.000000
Link                            0.000000
Game Description                0.020602
Recent Reviews Summary          5.779892
All Reviews Summary            91.476251
Recent Reviews Number           5.779892
All Reviews Number             91.476251
Developer                       0.059934
Publisher                       0.548771
Supported Languages             0.000000
Popular Tags                    0.000000
Game Features                   0.000000
Minimum Requirements            1.700629
Percentage_Positive_Reviews    36.040980
Total_User_Reviews              0.000000
Price                           0.000000
Year_Release                    0.000000
dtype: float64


# Report the (rows, columns) shape of the dataframe at this stage
print(f"current size of the dataframe: {games_df.shape}")

current size of the dataframe: (53392, 20)


# reset index of games_df
games_df = games_df.reset_index(drop=True)


warnings.filterwarnings('ignore')
# Since there's way too many outliers on the dependant variable (games with many reviews compared to the vast majority which have less reviews)
# we're going to use the Inter Quartile Range method for identifying these outliers and removing them

#  Finding the IQR
percentile25 = games_df['Total_User_Reviews'].quantile(0.25)
percentile75 = games_df['Total_User_Reviews'].quantile(0.75)
iqr = percentile75 - percentile25

# Finding the upper and lower limits (Tukey fences)
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr

# Finding outliers: values strictly outside the fences
outliers_high = games_df[games_df['Total_User_Reviews'] > upper_limit]
outliers_low = games_df[games_df['Total_User_Reviews'] < lower_limit]

# Trimming outliers: keep everything inside the fences, boundaries included, so the rows removed
# are exactly the ones flagged as outliers above (strict comparisons would also drop values that
# sit exactly on a fence). `between` is inclusive on both ends and False for NaN.
trimmed_df = games_df[games_df['Total_User_Reviews'].between(lower_limit, upper_limit)].reset_index(drop=True)

# Compare the plots after trimming
plt.figure(figsize=(16, 8))

# histplot replaces the deprecated distplot; bins are fixed at 50 (the cap distplot applied)
# because automatic binning on such a long-tailed column can produce an enormous number of bins
plt.subplot(2, 2, 1)
sns.histplot(games_df['Total_User_Reviews'], bins=50, kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Original Distribution')

# the column is passed as x= so the box is horizontal regardless of the seaborn version
plt.subplot(2, 2, 2)
sns.boxplot(x=games_df['Total_User_Reviews'])
plt.title('Original Boxplot')

plt.subplot(2, 2, 3)
sns.histplot(trimmed_df['Total_User_Reviews'], bins=50, kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Trimmed Distribution')

plt.subplot(2, 2, 4)
sns.boxplot(x=trimmed_df['Total_User_Reviews'])
plt.title('Trimmed Boxplot')

plt.tight_layout()
plt.show()

print("Original Dataset Statistics:")
print(f"Previous dataset size: {len(games_df)}")
print(f"mean: {games_df['Total_User_Reviews'].mean():.2f}")
print(f"max: {games_df['Total_User_Reviews'].max()}")
print(f"min: {games_df['Total_User_Reviews'].min()}")
print(f"std: {games_df['Total_User_Reviews'].std():.2f}")

print("\nTrimmed Dataset Statistics:")
print(f"Current dataset size: {len(trimmed_df)}")
print(f"mean: {trimmed_df['Total_User_Reviews'].mean():.2f}")
print(f"max: {trimmed_df['Total_User_Reviews'].max()}")
print(f"min: {trimmed_df['Total_User_Reviews'].min()}")
print(f"std: {trimmed_df['Total_User_Reviews'].std():.2f}")

# Number of outliers removed (absolute count and fraction of the original rows)
num_outliers_removed = len(games_df) - len(trimmed_df)
print("\nNumber of outliers removed:", num_outliers_removed, f"({num_outliers_removed/len(games_df)})")

Original Dataset Statistics:
Previous dataset size: 53392
mean: 1590.59
max: 7428921.0
min: 0.0
std: 38589.05

Trimmed Dataset Statistics:
Current dataset size: 44516
mean: 36.90
max: 279.0
min: 0.0
std: 55.90

Number of outliers removed: 8876 (0.1662421336529817)


# The trimmed statistics look far more reasonable, so continue with the trimmed dataset
games_df = trimmed_df


# Open question: should the dataset be restricted to games carrying the "Indie" tag?
mask_indie = games_df["Popular Tags"].str.contains("Indie")
indie_games_df = games_df[mask_indie]
print(f"only Indie tag dataframe size: {indie_games_df.shape}")
print(f"{len(games_df) - len(indie_games_df)} less games")

only Indie tag dataframe size: (29564, 20)
14952 less games


# Compare the review-count distribution of Indie vs. Non-Indie games (uses mask_indie)
indie_reviews = games_df[mask_indie]["Total_User_Reviews"].dropna()
non_indie_reviews = games_df[~mask_indie]["Total_User_Reviews"].dropna()

fig, ax = plt.subplots(figsize=(10, 6))  # one axes holding both boxplots

# Position 0 -> Indie, position 1 -> Non-Indie
for position, reviews in enumerate((indie_reviews, non_indie_reviews)):
    ax.boxplot(reviews, positions=[position], vert=False)

ax.set_yticklabels(['Indie', 'Non-Indie'])
ax.set_xlabel('Total User Reviews')
ax.set_title('Comparison of Total User Reviews (Indie vs. Non-Indie games)')

# Caption placed below the axes
plt.text(
    0.5,
    -0.3,
    "Visually, both boxplots don't show too different of a distribution",
    horizontalalignment='center',
    verticalalignment='center',
    transform=ax.transAxes,
)

plt.tight_layout()
plt.show()

# Group sizes
print(len(indie_reviews))
print(len(non_indie_reviews))

29564
14952


# Check whether free-to-play games need separate treatment
f2p_mask = games_df["Original Price"].str.contains("Free")
review_counts = games_df["Total_User_Reviews"]

fig, ax = plt.subplots(figsize=(10, 6))  # one axes holding both boxplots

# Position 0 -> Non-F2P, position 1 -> F2P
for position, reviews in enumerate((review_counts[~f2p_mask], review_counts[f2p_mask])):
    ax.boxplot(reviews.dropna(), positions=[position], vert=False)

ax.set_yticklabels(['Non-F2P', 'F2P'])
ax.set_xlabel('Total User Reviews')
ax.set_title('Comparison of Total User Reviews (F2P vs. Non-F2P games)')

# Caption placed below the axes
plt.text(
    0.5,
    -0.3,
    "Based on the boxplots, Non-F2P games have much less user reviews when compared to F2P games",
    horizontalalignment='center',
    verticalalignment='center',
    transform=ax.transAxes,
)

plt.tight_layout()
plt.show()


# Because the distributions differ a lot for F2P and non-F2P games, add a 0/1 indicator column for them
games_df["F2P"] = f2p_mask.astype(int)


# IQR method will be used again to see what values can we keep
warnings.filterwarnings('ignore')
# Price also has extreme outliers, so the Inter Quartile Range method is used
# for identifying and removing them

#  Finding the IQR
percentile25 = games_df['Price'].quantile(0.25)
percentile75 = games_df['Price'].quantile(0.75)
iqr = percentile75 - percentile25

# Finding the upper and lower limits (Tukey fences)
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr

# Finding outliers: values strictly outside the fences
outliers_high = games_df[games_df['Price'] > upper_limit]
outliers_low = games_df[games_df['Price'] < lower_limit]

# Trimming outliers: keep everything inside the fences, boundaries included, so the rows removed
# are exactly the ones flagged as outliers above (strict comparisons would also drop values that
# sit exactly on a fence). `between` is inclusive on both ends and False for NaN.
trimmed_df = games_df[games_df['Price'].between(lower_limit, upper_limit)].reset_index(drop=True)

# Compare the plots after trimming
plt.figure(figsize=(16, 8))

# histplot replaces the deprecated distplot; bins are fixed at 50 (the cap distplot applied)
# because automatic binning on such a long-tailed column can produce an enormous number of bins
plt.subplot(2, 2, 1)
sns.histplot(games_df['Price'], bins=50, kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Original Distribution')

# the column is passed as x= so the box is horizontal regardless of the seaborn version
plt.subplot(2, 2, 2)
sns.boxplot(x=games_df['Price'])
plt.title('Original Boxplot')

plt.subplot(2, 2, 3)
sns.histplot(trimmed_df['Price'], bins=50, kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Trimmed Distribution')

plt.subplot(2, 2, 4)
sns.boxplot(x=trimmed_df['Price'])
plt.title('Trimmed Boxplot')

plt.tight_layout()
plt.show()

print("Original Dataset Statistics:")
print(f"Previous dataset size: {len(games_df)}")
print(f"mean: {games_df['Price'].mean():.2f}")
print(f"max: {games_df['Price'].max()}")
print(f"min: {games_df['Price'].min()}")
print(f"std: {games_df['Price'].std():.2f}")

print("\nTrimmed Dataset Statistics:")
print(f"Current dataset size: {len(trimmed_df)}")
print(f"mean: {trimmed_df['Price'].mean():.2f}")
print(f"max: {trimmed_df['Price'].max()}")
print(f"min: {trimmed_df['Price'].min()}")
print(f"std: {trimmed_df['Price'].std():.2f}")

# Number of outliers removed (absolute count and fraction of the original rows)
num_outliers_removed = len(games_df) - len(trimmed_df)
print("\nNumber of outliers removed:", num_outliers_removed,f"({num_outliers_removed/len(games_df)})")

Original Dataset Statistics:
Previous dataset size: 44516
mean: 952.08
max: 14046333.0
min: 0.0
std: 115306.99

Trimmed Dataset Statistics:
Current dataset size: 43303
mean: 3.78
max: 13.57
min: 0.0
std: 3.02

Number of outliers removed: 1213 (0.02724862970617306)


# Let's assign games_df to the new trimmed dataset
games_df = trimmed_df


# Count how often each entry of "Game Features" occurs across all games.
# Every row's stringified list is joined into one long string; the brackets are turned into
# commas so that splitting on ',' yields one token per feature (quotes are kept in the tokens).
# The bracket replacement also produces empty tokens, which get counted like any other value.
game_feat = " ".join(games_df["Game Features"].astype(str))
game_feat = game_feat.replace("[",",").replace("]",",")
game_feat_list = [el.strip() for el in game_feat.split(",")]

# DataFrame.value_counts is used here, so each index entry of the result is a 1-tuple
game_feat_pd = pd.DataFrame(game_feat_list)
value_counts_gf = game_feat_pd.value_counts()
# Number of distinct tokens (including the empty one)
print(len(value_counts_gf))

33


# Plotting the horizontal bar chart of feature counts
plt.figure(figsize=(10, 8))
ax = value_counts_gf.plot(kind='barh')  # Create the bar plot

plt.title('Game Features')
plt.xlabel('Counts')
plt.ylabel('Categories')
plt.gca().invert_yaxis()  # Invert y-axis so the first entry of value_counts_gf is drawn at the top
plt.ticklabel_format(style='plain', axis='x')  # Disable scientific notation on x-axis

# Adding text annotations for values at the end of the bars
# (bar i sits at y position i, so the enumerate index doubles as the y coordinate)
for i, v in enumerate(value_counts_gf):
    ax.text(v + 3, i, str(v), color='black', va='center')  # Adjust the "+3" offset for text placement

plt.tight_layout()
plt.show()


# based on the value counts of "Game Features", we are going to keep all of the features that 
# have a frequency higher than 4 and remove the empty values
# NOTE(review): `[1:]` drops whatever entry comes first after filtering — this assumes the empty
# token is the most frequent value; verify if the data changes
value_counts_gf = value_counts_gf[value_counts_gf>4][1:]
value_counts_gf

'Single-player'                 41738
'Steam Achievements'            20277
'Steam Cloud'                    9134
'Full controller support'        8824
'Partial Controller Support'     5871
'Steam Trading Cards'            4904
'Steam Leaderboards'             4189
'Remote Play Together'           3679
'Tracked Controller Support'     3549
'VR Only'                        3218
'Online PvP'                     3035
'Shared/Split Screen PvP'        2537
'Stats'                          2164
'Shared/Split Screen Co-op'      1901
'Online Co-op'                   1678
'Cross-Platform Multiplayer'     1042
'Includes level editor'           992
'Remote Play on TV'               867
'Steam Workshop'                  712
'Captions available'              669
'In-App Purchases'                596
'VR Supported'                    521
'LAN PvP'                         341
'MMO'                             307
'LAN Co-op'                       278
'Remote Play on Phone'            195
'Remote Play on Tablet'           192
'Commentary available'            115
'Steam Turn Notifications'         53
'Includes Source SDK'              15
'SteamVR Collectibles'              8
dtype: int64


# Number of features kept after filtering
value_counts_gf.shape

(31,)


games_df.reset_index(drop=True,inplace=True)


# Creating the dummy variables for game_features: one 0/1 column per kept feature, set to 1 when
# the feature name occurs in the row's "Game Features" text.
# Index entries of value_counts_gf are 1-tuples holding the quoted feature name.
game_features_list = [el[0].replace("'","") for el in value_counts_gf.index]

# One vectorised substring test per feature replaces the former row-by-row loop of scalar
# `.loc` writes (rows x features assignments), which is extremely slow on a frame this size.
# regex=False keeps the plain substring semantics of the original `in` test.
game_features_text = games_df["Game Features"].astype(str)
for feature in game_features_list:
    games_df[feature] = game_features_text.str.contains(feature, regex=False).astype(int)


games_df.head(2)


# Let's see how many tags are we dealing with
# Same approach as for the game features: join all stringified tag lists, turn the brackets
# into commas, split on ',' and strip whitespace (quotes stay part of each token)
all_tags = " ".join(games_df["Popular Tags"].astype(str))
all_tags = all_tags.replace("[",",").replace("]",",")
all_tags_list = [el.strip() for el in all_tags.split(",")]

# Frequency of every distinct token; the empty token created by the bracket replacement is included
all_tags_pd = pd.DataFrame(all_tags_list)
value_counts = all_tags_pd.value_counts()
print(len(value_counts))
value_counts[value_counts>0]

441

                  43312
'Indie'           29096
'Singleplayer'    21521
'Casual'          20822
'Action'          19572
                  ...  
'Shop Keeper'         4
'Fox'                 4
'TrackIR'             4
'Reboot'              3
'Birds'               1
Length: 441, dtype: int64


# Parse each stringified tag list into a real Python list
# (ast.literal_eval only evaluates literals, so no arbitrary code is executed)
games_df['Popular Tags'] = games_df['Popular Tags'].apply(ast.literal_eval)


# One-hot encode the tags: join every list with '|' and let get_dummies split on that separator
pipe_joined_tags = games_df['Popular Tags'].str.join('|')
tags_dummies_df = pipe_joined_tags.str.get_dummies()
tags_dummies_df


import numpy as np
from sklearn.decomposition import PCA

# Fit a PCA with all components on the one-hot tag matrix to inspect how much variance
# each additional component explains
pca = PCA()
pca.fit(tags_dummies_df) 

# Calculate cumulative explained variance ratio
cumulative_var = np.cumsum(pca.explained_variance_ratio_)

# Plot the cumulative explained variance ratio
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(cumulative_var) + 1), cumulative_var, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio by Number of Components')
plt.grid(True)

threshold = 0.95  # Set the threshold here
plt.axhline(y=threshold, color='r', linestyle='--', label=f'{threshold*100}% Threshold')
plt.legend()


plt.show()

# Find the number of components that achieve the desired threshold
# (argmax returns the first index where the condition is True; +1 converts index to count)
num_components = np.argmax(cumulative_var >= threshold) + 1

print(f"Number of components needed to achieve {threshold*100}% threshold: {num_components}")

Number of components needed to achieve 95.0% threshold: 238


# Since there are way too many tag variables and we would like the user to be able to input as many as they want,
# we'll apply Principal Component Analysis (PCA) to summarize the tags so that the predictive model can handle them
# more efficiently
from sklearn.decomposition import PCA

pca = PCA(n_components=num_components)  # Specify the desired number of components based on the previous plot
pca.fit(tags_dummies_df)  
# Transform the data using PCA; columns are named PCA_0 ... PCA_{n-1}, matching the rows of pca.components_
df_pca = pd.DataFrame(pca.transform(tags_dummies_df), columns=[f'PCA_{i}' for i in range(pca.n_components_)])

df_pca.head()


# Save the PCA components to use on the website
# NOTE(review): only pca.components_ is exported; if the website is meant to reproduce
# pca.transform it presumably also needs pca.mean_ for centering — confirm
np.savetxt('pca_components.csv', pca.components_, delimiter=',')


# Concatenate the PCA features into the games_df dataframe 
# (column-wise concat aligns on the index; df_pca has a fresh 0..n-1 index, so games_df must too)
games_df = pd.concat([games_df, df_pca], axis=1)
games_df.head(1)


# Add a 0/1 indicator column for each of the 8 languages listed below
# (English is excluded, since it's supported by most games)
languages = ["German","French","Russian","Spanish - Spain","Simplified Chinese","Japanese","Portuguese - Brazil","Korean"]

# One vectorised substring test per language replaces the former row-by-row loop of scalar
# `.loc` writes, which is extremely slow on a frame this size.
# regex=False keeps the plain substring semantics of the original `in` test.
supported_languages_text = games_df["Supported Languages"].astype(str)
for language in languages:
    games_df[language] = supported_languages_text.str.contains(language, regex=False).astype(int)
games_df.head(2)


# With the language flags in place, drop every column that is not a model input or the target
remove_cols = [
    "Title", "Original Price", "Discounted Price", "Release Date", "Link", "Game Description",
    "Recent Reviews Summary", "All Reviews Summary", "Recent Reviews Number", "All Reviews Number",
    "Developer", "Publisher", "Supported Languages", "Popular Tags", "Game Features",
    "Minimum Requirements", "Year_Release", "Percentage_Positive_Reviews",
]
# drop() returns a new frame, so games_df itself stays untouched; rows with any missing value are removed
clean_games_df = games_df.drop(columns=remove_cols).dropna()
print("Shape of dataset before formatting:", games_df.shape)
print("Shape of dataset after formatting:", clean_games_df.shape)

Shape of dataset before formatting: (43303, 298)
Shape of dataset after formatting: (43303, 280)


# Normalise column names: spaces, hyphens and slashes all become underscores
separator_map = str.maketrans({" ": "_", "-": "_", "/": "_"})
clean_games_df.columns = [col.translate(separator_map) for col in clean_games_df.columns]


min_price = 0
# Games priced above min_price, for the left-hand panel
priced_games = clean_games_df[clean_games_df["Price"] > min_price]

# Two side-by-side panels: priced games only (left) and all games (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (axes[0], priced_games, f"Total_User_Reviews vs. Price > {min_price}"),
    (axes[1], clean_games_df, "Total_User_Reviews vs. Price"),
)

# Scatter of Total_User_Reviews against Price with a black LOWESS trend line on each panel
for axis, frame, title in panels:
    sns.regplot(
        ax=axis,
        x=frame["Price"],
        y=frame["Total_User_Reviews"],
        lowess=True,
        line_kws={'color': 'black'},
    )
    axis.set_title(title)

plt.tight_layout()
plt.show()


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Splitting the data into features (X) and target variable (y)
X = clean_games_df.drop('Total_User_Reviews', axis=1)  # Features
y = clean_games_df['Total_User_Reviews']  # Target variable

# Performing train-test split (70% train, 30% test, as set by test_size=0.3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training a Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = linear_model.predict(X_test)

# Calculating evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# MAPE is computed only over rows whose true value is non-zero (avoids division by zero);
# it is NaN when every true value is zero
mask = y_test != 0
mape = (np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100) if np.any(mask) else np.nan

# Displaying performance metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Calculate Standard Deviation of Errors (residuals = true value minus prediction)
std_error = np.std(y_test - y_pred)
print(f"Standard Deviation of Errors: {std_error:.2f}")

Mean Squared Error (MSE): 2273.12
R-squared (R2): 0.2750
Mean Absolute Error (MAE): 30.98
Mean Absolute Percentage Error (MAPE): 361.34%
Root Mean Squared Error (RMSE): 47.68
Standard Deviation of Errors: 47.67


# Extracting coefficients and intercept of the fitted linear model
coefficients = linear_model.coef_
intercept = linear_model.intercept_

# Optionally, save to a file (plain text: the coefficient list on one line, the intercept on the next)
with open("model_coefficients.txt", "w") as file:
    file.write(f"Coefficients: {coefficients.tolist()}\n")
    file.write(f"Intercept: {intercept}\n")


# Get the coefficients and corresponding feature names
coefficients = linear_model.coef_
feature_names = X_train.columns  # Column names of the training features, in the order the model was fitted on

# Create a DataFrame to associate coefficients with feature names
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Get the top 10 most influential coefficients by absolute value
# (rows are reordered by descending |coefficient|; the signed value is what gets displayed)
top_10_coefficients = coef_df.reindex(coef_df['Coefficient'].abs().sort_values(ascending=False).index)[:10]

print("Top 10 Most Influential Coefficients:")
print(top_10_coefficients)

Top 10 Most Influential Coefficients:
                   Feature  Coefficient
32    SteamVR_Collectibles    51.097263
7      Steam_Trading_Cards    39.937248
1                      F2P    26.723267
28   Remote_Play_on_Tablet    22.446627
27    Remote_Play_on_Phone   -18.116847
198                PCA_165    17.884798
22        In_App_Purchases    15.187831
264                PCA_231   -14.612801
25                     MMO    14.597627
200                PCA_167    13.926585


# Step 1: Extract the PCA component loadings for a component chosen interactively
# (the number is used directly as a row index into pca.components_, so it is zero-based)
component_number = int(input("Query PCA component number:"))  # Change this to the desired component number
component_loadings = pca.components_[component_number]

# Step 2: Identify the highest loadings
# NOTE(review): despite its name, `absolute_loadings` holds the signed loadings (no abs() is applied),
# so the ranking below surfaces only the most positive loadings; strongly negative ones are not shown
absolute_loadings = component_loadings

# Sort the indices of loadings in descending order of signed value
sorted_indices = absolute_loadings.argsort()[::-1]

# Retrieve the tag names based on the sorted indices
feature_names = list(tags_dummies_df.columns)  # Replace with your feature DataFrame
# NOTE(review): related_categories is not used further down in this cell
related_categories = [feature_names[i] for i in sorted_indices]

# Set the number of top categories you want to display
top_n = 5  # Change this to the desired number

# Print the top N categories with their loading values
print(f"Top {top_n} Positively Associated Categories for PCA Component {component_number}:")
for i in range(top_n):
    category = feature_names[sorted_indices[i]]
    loading_value = component_loadings[sorted_indices[i]]
    print(f"{category}: {loading_value:.4f}")

Query PCA component number:165
Top 5 Positively Associated Categories for PCA Component 165:
Classic: 0.5705
Collectathon: 0.1762
Destruction: 0.1672
Modern: 0.1377
Detective: 0.1299


# Same clean-up as for the regression dataset, except that Percentage_Positive_Reviews is kept
remove_cols = [
    "Title", "Original Price", "Discounted Price", "Release Date", "Link", "Game Description",
    "Recent Reviews Summary", "All Reviews Summary", "Recent Reviews Number", "All Reviews Number",
    "Developer", "Publisher", "Supported Languages", "Popular Tags", "Game Features",
    "Minimum Requirements", "Year_Release",
]
# drop() returns a new frame, so games_df itself stays untouched; rows with any missing value
# (e.g. games without a review percentage) are removed and the original index labels are kept
reviews_df = games_df.drop(columns=remove_cols).dropna()
print("Shape of dataset before formatting:", games_df.shape)
print("Shape of dataset after formatting:", reviews_df.shape)

Shape of dataset before formatting: (43303, 298)
Shape of dataset after formatting: (24357, 281)


# Keep the review percentage aside (it shares reviews_df's index) and remove it from the
# frame, so the frame no longer contains it when it is used for clustering
perc_reviews = reviews_df["Percentage_Positive_Reviews"]
reviews_df.drop("Percentage_Positive_Reviews",axis=1,inplace=True)


# Smallest review count among the games that have a review percentage
print("Minimum amount of reviews for the game to have a %:", reviews_df["Total_User_Reviews"].min())

Minimum amount of reviews for the game to have a %: 10.0


import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fixed seed for every KMeans run: the initial centroids are drawn at random, so without it
# the cluster ids (and everything derived from them: per-cluster plots, exported centroids)
# change from run to run. 42 matches the seed used for the train/test split.
KMEANS_RANDOM_STATE = 42

# NOTE(review): reviews_df mixes Total_User_Reviews and Price with 0/1 indicator columns
# without scaling, so distances are presumably dominated by the large-valued columns — confirm
# whether that is intended.

# Calculate the sum of squared distances for a range of cluster numbers
sse = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=KMEANS_RANDOM_STATE)
    kmeans.fit(reviews_df)
    sse.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 10), sse, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.grid(True)
plt.show()


pd.set_option('display.max_columns', None)

reviews_df.iloc[:,:280].head()


# based on the elbow method, we're going to define the number of clusters as K = 4
kmeans = KMeans(n_clusters=4, random_state=KMEANS_RANDOM_STATE)
kmeans.fit(reviews_df)

# Labels are stored after fitting, so the Cluster column is not part of the clustered features
reviews_df['Cluster'] = kmeans.labels_


# Pair every cluster label with the review statistics of the same game.
# kmeans.labels_ is a plain array in the row order of reviews_df, while perc_reviews and
# reviews_df still carry the gapped index left by the earlier dropna. Assigning those Series
# into a frame with a fresh 0..n-1 index would align on index labels, pairing labels with the
# wrong games and leaving NaN wherever a label is missing. Converting to arrays keeps the
# pairing positional.
clusters_reviews = pd.DataFrame(
    {
        "Cluster": kmeans.labels_,
        "Percentage_Positive_Reviews": perc_reviews.to_numpy(),
        "Total_User_Reviews": reviews_df["Total_User_Reviews"].to_numpy(),
    }
)
clusters_reviews.head()


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Requires the 'clusters_reviews' DataFrame

# Pastel palette, one colour per cluster
pastel_colors = sns.color_palette("pastel")

# One standalone figure per cluster (4 clusters), saved for the website and then shown
for cluster_id in range(4):
    cluster_percentages = clusters_reviews.loc[
        clusters_reviews['Cluster'] == cluster_id, "Percentage_Positive_Reviews"
    ]

    # Histogram with KDE overlay, pastel fill and no bar outlines
    plt.figure(figsize=(8, 6))
    sns.histplot(cluster_percentages, kde=True, color=pastel_colors[cluster_id], edgecolor='none')
    plt.title(f'Cluster {cluster_id} - Histogram of Percentage Positive Reviews')
    plt.xlabel('Percentage Positive Reviews')
    plt.ylabel('Frequency')
    plt.savefig(f'Website/images/cluster_{cluster_id}_histogram.png')
    plt.show()
    plt.close()


# Pastel palette, one colour per cluster
pastel_colors = sns.color_palette("pastel")

# The same four histograms again, this time together in a 2x2 grid
fig, axes = plt.subplots(2,2, figsize=(14, 10))

# Flattened so that panel i belongs to cluster i
axes_flat = axes.flatten()

for cluster_id, panel in enumerate(axes_flat):
    cluster_percentages = clusters_reviews.loc[
        clusters_reviews['Cluster'] == cluster_id, "Percentage_Positive_Reviews"
    ]

    # Histogram with KDE overlay, pastel fill and no bar outlines
    sns.histplot(cluster_percentages, kde=True, color=pastel_colors[cluster_id], edgecolor='none', ax=panel)
    panel.set_title(f'Cluster {cluster_id} - Histogram of Percentage Positive Reviews')
    panel.set_xlabel('Percentage Positive Reviews')
    panel.set_ylabel('Frequency')

# Adjust the layout and display the grid
plt.tight_layout()
plt.show()


import pandas as pd

# Summarise the positive-review percentage of every cluster: one row per
# cluster with its median, mean and standard deviation.
positive_share_by_cluster = clusters_reviews.groupby('Cluster')['Percentage_Positive_Reviews']

# Named aggregation gives the result columns their display names directly
cluster_stats = positive_share_by_cluster.agg(
    Median='median',
    Mean='mean',
    Std='std',
).reset_index()

# Display the cluster statistics
cluster_stats


# Export the k-means centroids as JSON for the JavaScript side of the website
import json

# .tolist() yields plain nested Python lists, which json can serialise
centroids = kmeans.cluster_centers_.tolist()

# Serialise first, then write the text to the website's models folder
centroids_json = json.dumps(centroids)
with open('Website/models/centroids.json', 'w') as centroids_file:
    centroids_file.write(centroids_json)


# Display the per-cluster summary table (median / mean / std) again
cluster_stats


# Calculate quantiles of the total number of user reviews per game
# to show on the website
# Cut points are cumulative fractions. Each one is listed exactly once: a
# repeated cut point (0.95 used to appear twice) only produces a duplicated
# row in the resulting table.
quantiles = [0, 0.5, 0.75, 0.85, 0.95, 1]  # Adjust as needed
quantile_values = clean_games_df['Total_User_Reviews'].quantile(quantiles)

# One label per interval between consecutive cut points above
# (keep in sync with `quantiles` whenever the cut points are adjusted)
quantile_labels = ['Bottom 50%', '50%-75%', '75%-85%', '85%-95%', 'Top 5%']

# Display the quantile values as a one-column table indexed by cut point
pd.DataFrame(quantile_values)

	Title	Original Price	Discounted Price	Release Date	Link	Game Description	Recent Reviews Summary	All Reviews Summary	Recent Reviews Number	All Reviews Number	...	Popular Tags	Game Features	Minimum Requirements	Percentage_Positive_Reviews	Percentage_Positive_Reviews_Recent	Total_User_Reviews	Total_User_Reviews_Recent	Total_User_Reviews_Summary	Price	Year_Release
0	Baldur's Gate 3	$29.99	$29.99	3 Aug, 2023	https://store.steampowered.com/app/1086940/Bal...	Baldur’s Gate 3 is a story-rich, party-based R...	Overwhelmingly Positive	Very Positive	- 96% of the 128,900 user reviews in the last ...	- 94% of the 188,617 user reviews for this gam...	...	['RPG', 'Choices Matter', 'Character Customiza...	['Single-player', 'Online Co-op', 'LAN Co-op',...	Requires a 64-bit processor and operating syst...	94.0	96.0	188617.0	128900.0	NaN	29.99	2023.0
1	Counter-Strike: Global Offensive	$14.99	$14.99	21 Aug, 2012	https://store.steampowered.com/app/730/Counter...	Counter-Strike: Global Offensive (CS: GO) expa...	Very Positive	Very Positive	- 89% of the 75,284 user reviews in the last 3...	- 88% of the 7,428,921 user reviews for this g...	...	['FPS', 'Shooter', 'Multiplayer', 'Competitive...	['Steam Achievements', 'Full controller suppor...	OS: \| Windows® 7/Vista/XP \| Processor: \| Int...	88.0	89.0	7428921.0	75284.0	NaN	14.99	2012.0
2	Apex Legends™	Free	Free	4 Nov, 2020	https://store.steampowered.com/app/1172470/Ape...	Apex Legends is the award-winning, free-to-pla...	Mixed	Very Positive	- 65% of the 18,581 user reviews in the last 3...	- 80% of the 701,597 user reviews for this gam...	...	['Free to Play', 'Multiplayer', 'Battle Royale...	['Online PvP', 'Online Co-op', 'Steam Achievem...	Requires a 64-bit processor and operating syst...	80.0	65.0	701597.0	18581.0	NaN	0.00	2020.0

	PCA_0	PCA_1	PCA_2	PCA_3	PCA_4	PCA_5	PCA_6	PCA_7	PCA_8	PCA_9	...	PCA_228	PCA_229	PCA_230	PCA_231	PCA_232	PCA_233	PCA_234	PCA_235	PCA_236	PCA_237
0	0.409657	-0.324480	0.591374	-0.847256	0.609709	0.003414	-0.087016	0.017850	0.658866	-0.022729	...	0.117069	-0.097110	-0.206132	0.277560	0.127647	0.162974	-0.130298	0.278489	0.072791	0.065839
1	0.694280	-0.731256	-0.804323	-0.131967	0.679628	-0.766924	0.381220	0.242563	0.913800	-0.630273	...	0.043569	-0.015040	-0.044558	0.030819	-0.001442	0.061336	-0.061068	0.025783	0.030147	-0.007183
2	0.803688	-0.764302	-0.712482	-0.142711	0.702741	-0.562767	0.513145	0.434236	1.110108	-0.440919	...	0.050786	-0.036438	-0.089431	-0.067511	0.011603	-0.057506	-0.060196	0.002193	-0.014653	0.018034
3	0.929491	-0.932884	-0.726056	-0.114928	0.522740	-0.842807	0.088108	-0.060701	0.774992	-0.580296	...	0.125382	0.052405	-0.023821	0.057092	0.020841	0.081334	-0.024063	0.068355	-0.003734	-0.024518
4	-0.270744	0.983164	0.345343	0.082186	-0.110755	-0.321691	0.829532	-0.277355	0.007374	0.067947	...	0.462584	-0.067987	-0.153376	0.059805	-0.254568	0.153198	-0.199089	0.094935	-0.468095	-0.160779

	Title	Original Price	Discounted Price	Release Date	Link	Game Description	Recent Reviews Summary	All Reviews Summary	Recent Reviews Number	All Reviews Number	...	PCA_236	PCA_237	German	French	Russian	Spanish - Spain	Simplified Chinese	Japanese	Portuguese - Brazil	Korean
0	Not Tonight 2	$10.49	$4.72	11 Feb, 2022	https://store.steampowered.com/app/1600370/Not...	Immigration Enforcement Case #112: You are und...	Very Positive	NaN	- 86% of the 235 user reviews for this game ar...	NaN	...	0.072791	0.065839	0	0	0	0	0	0	0	0
1	Forest Camp Story	$5.79	$5.79	27 Nov, 2022	https://store.steampowered.com/app/1983690/For...	Create and customize your own campground in th...	Very Positive	NaN	- 90% of the 85 user reviews for this game are...	NaN	...	0.030147	-0.007183	0	0	0	0	0	0	0	0

	Total_User_Reviews	Price	Single-player	Steam Achievements	Steam Cloud	Full controller support	Tracked Controller Support	VR Only	PCA_0	PCA_1	PCA_2	PCA_3	PCA_4	PCA_5	PCA_6	PCA_7	PCA_8	PCA_9	PCA_10	PCA_11	PCA_12	PCA_13	PCA_14	PCA_15	PCA_16	PCA_17	PCA_18	PCA_19	PCA_20	PCA_21	PCA_22	PCA_23	PCA_24	PCA_25	PCA_26	PCA_27	PCA_28	PCA_29	PCA_30	PCA_31	PCA_32	PCA_33	PCA_34	PCA_35	PCA_36	PCA_37	PCA_38	PCA_39	PCA_40	PCA_41	PCA_42	PCA_43	PCA_44	PCA_45	PCA_46	PCA_47	PCA_48	PCA_49	PCA_50	PCA_51	PCA_52	PCA_53	PCA_54	PCA_55	PCA_56	PCA_57	PCA_58	PCA_59	PCA_60	PCA_61	PCA_62	PCA_63	PCA_64	PCA_65	PCA_66	PCA_67	PCA_68	PCA_69	PCA_70	PCA_71	PCA_72	PCA_73	PCA_74	PCA_75	PCA_76	PCA_77	PCA_78	PCA_79	PCA_80	PCA_81	PCA_82	PCA_83	PCA_84	PCA_85	PCA_86	PCA_87	PCA_88	PCA_89	PCA_90	PCA_91	PCA_92	PCA_93	PCA_94	PCA_95	PCA_96	PCA_97	PCA_98	PCA_99	PCA_100	PCA_101	PCA_102	PCA_103	PCA_104	PCA_105	PCA_106	PCA_107	PCA_108	PCA_109	PCA_110	PCA_111	PCA_112	PCA_113	PCA_114	PCA_115	PCA_116	PCA_117	PCA_118	PCA_119	PCA_120	PCA_121	PCA_122	PCA_123	PCA_124	PCA_125	PCA_126	PCA_127	PCA_128	PCA_129	PCA_130	PCA_131	PCA_132	PCA_133	PCA_134	PCA_135	PCA_136	PCA_137	PCA_138	PCA_139	PCA_140	PCA_141	PCA_142	PCA_143	PCA_144	PCA_145	PCA_146	PCA_147	PCA_148	PCA_149	PCA_150	PCA_151	PCA_152	PCA_153	PCA_154	PCA_155	PCA_156	PCA_157	PCA_158	PCA_159	PCA_160	PCA_161	PCA_162	PCA_163	PCA_164	PCA_165	PCA_166	PCA_167	PCA_168	PCA_169	PCA_170	PCA_171	PCA_172	PCA_173	PCA_174	PCA_175	PCA_176	PCA_177	PCA_178	PCA_179	PCA_180	PCA_181	PCA_182	PCA_183	PCA_184	PCA_185	PCA_186	PCA_187	PCA_188	PCA_189	PCA_190	PCA_191	PCA_192	PCA_193	PCA_194	PCA_195	PCA_196	PCA_197	PCA_198	PCA_199	PCA_200	PCA_201	PCA_202	PCA_203	PCA_204	PCA_205	PCA_206	PCA_207	PCA_208	PCA_209	PCA_210	PCA_211	PCA_212	PCA_213	PCA_214	PCA_215	PCA_216	PCA_217	PCA_218	PCA_219	PCA_220	PCA_221	PCA_222	PCA_223	PCA_224	PCA_225	PCA_226	PCA_227	PCA_228	PCA_229	PCA_230	PCA_231	PCA_232	PCA_233	PCA_234	PCA_235	PCA_236	PCA_237	German	French	Russian	Spanish - Spain	Simplified Chinese	Japanese	Korean
0	235.0	10.49	1	0	0	0	0	0	0.409657	-0.324480	0.591374	-0.847256	0.609709	0.003414	-0.087016	0.017850	0.658866	-0.022729	0.422741	-0.736449	0.024179	0.332598	-0.155874	-0.082455	0.403533	0.258488	-0.617579	-0.035172	-0.242397	-0.504659	-0.758944	0.377408	0.256094	0.754386	0.141912	0.172213	-0.530914	-0.375347	-0.317421	0.063041	-0.157411	-0.033311	0.488900	-0.153821	0.013936	-0.088348	-0.100366	0.199055	-0.040522	0.397522	-0.021696	-0.343000	0.140135	0.006366	-0.072245	-0.102702	-0.030086	-0.275548	0.358312	0.082344	0.063584	-0.016916	0.228669	0.180594	0.040635	-0.180290	0.038063	-0.207103	-0.554190	0.576845	0.204225	0.203906	-0.014385	0.240292	0.263264	-0.097004	-0.000072	0.090535	-0.051816	-0.000427	-0.206293	-0.322158	0.047666	0.067689	-0.284267	0.152211	-0.073938	0.049885	0.135372	-0.031453	-0.148448	0.353353	0.083490	-0.234919	0.170601	0.211671	0.124677	0.215157	0.096982	0.001342	-0.375367	-0.294239	0.329806	0.611630	0.403976	0.866488	0.016706	-0.319775	0.049421	0.193033	0.073478	-0.335907	0.145451	0.187441	-0.107615	-0.152358	-0.059214	-0.122056	-0.036222	0.059762	-0.046586	0.078459	-0.088756	-0.071529	0.233146	0.217579	0.193820	0.172144	-0.216722	-0.261950	0.041870	-0.066101	0.160970	-0.161009	-0.093230	0.009910	0.267075	0.025416	0.389662	-0.270292	-0.269544	-0.132789	0.198340	-0.156007	-0.299557	0.256770	-0.257219	0.011908	-0.522128	-0.000429	-0.232961	0.160617	0.170392	-0.092830	-0.146597	0.227432	-0.078216	0.137583	-0.140770	-0.193893	0.063626	0.120977	-0.062667	0.013888	-0.193499	-0.024368	-0.103566	0.104873	0.097802	-0.303384	0.144505	0.220896	-0.017473	0.165641	-0.263065	-0.206114	0.444111	-0.432401	-0.354680	-0.293027	0.354977	0.074068	0.326064	0.150394	0.265315	-0.353465	0.328251	0.315179	0.508689	-0.196024	0.211125	-0.411297	-0.165329	0.435495	0.211316	-0.069842	-0.024485	-0.164164	0.208819	-0.229696	0.109706	0.212416	-0.021532	0.175783	-0.057621	0.319950	0.247179	0.383641	0.315966	-0.248953	-0.239320	0.048829	0.062792	-0.067516	-0.144538	-0.022905	
0.014889	-0.057549	0.044598	-0.234102	-0.093890	-0.264132	-0.004309	-0.112905	-0.133020	0.066963	0.043303	0.038266	-0.225912	0.275565	0.242087	-0.153624	-0.041829	0.113284	-0.155247	0.297862	0.117069	-0.097110	-0.206132	0.277560	0.127647	0.162974	-0.130298	0.278489	0.072791	0.065839	1	1	0	1	1	0	0
1	85.0	5.79	1	1	1	1	0	0	0.694280	-0.731256	-0.804323	-0.131967	0.679628	-0.766924	0.381220	0.242563	0.913800	-0.630273	0.216925	-0.047639	0.139831	-0.049149	0.196698	-0.567477	-0.272681	0.573347	-0.051737	-0.585852	-0.204863	-0.114170	0.537252	-0.007251	-0.028970	0.068364	-0.201951	-0.078865	-0.151912	-0.116429	-0.304233	0.278998	0.419693	-0.056275	0.017360	-0.077931	0.126280	0.287068	0.532625	0.106428	-0.000206	0.008367	-0.279171	0.119689	-0.021284	-0.190269	-0.129869	0.139435	0.205165	-0.036536	0.156708	0.337531	-0.001929	-0.262832	0.055594	-0.065271	0.001902	0.154905	0.072123	-0.157411	-0.138491	-0.066429	0.096890	0.042244	-0.085258	0.214388	-0.072919	-0.277029	-0.060948	-0.204603	0.135083	0.079547	0.077536	0.226930	-0.060401	0.009168	-0.015340	-0.071728	0.111302	-0.103752	0.017916	0.071730	0.041618	-0.131824	-0.020318	0.093251	0.013369	-0.143661	0.191193	-0.043753	0.343006	0.194677	-0.131906	0.106923	0.063810	0.513206	-0.116143	-0.530287	0.292702	0.259705	-0.032076	0.024206	-0.198518	-0.240365	0.295485	0.127039	0.033787	0.218871	-0.048416	0.083146	-0.031827	-0.013676	0.137829	0.138090	0.233207	0.079077	-0.000805	0.022617	-0.131520	0.142971	-0.018555	0.193114	-0.031982	0.092506	-0.181376	-0.174573	-0.081409	0.041785	0.015361	0.193292	0.310046	-0.075045	-0.249411	0.229453	-0.102497	-0.259892	-0.058988	-0.092342	-0.326359	-0.263733	0.168374	-0.151889	0.095384	-0.307994	-0.052884	-0.044226	0.035100	-0.018696	-0.032355	-0.065669	-0.137731	-0.004996	0.019167	0.077318	-0.012581	0.080013	-0.271913	0.274094	-0.029200	0.222342	0.007607	-0.431789	0.252465	0.414445	-0.158658	-0.018193	-0.629965	-0.184557	0.058902	0.142436	0.004293	-0.035554	-0.023122	0.043914	-0.015136	0.058832	-0.060033	-0.046239	0.059668	0.037756	-0.104149	-0.017250	0.089835	-0.090574	0.013425	0.023991	0.095297	-0.035234	-0.116351	0.008288	-0.025400	0.027516	0.020887	0.106122	0.060141	0.203445	-0.324235	-0.205319	-0.206892	0.173966	-0.152762	-0.258100	0.195660	0.054311	-0.084749	-0.149885	0.271648	
0.231872	-0.141589	-0.147036	-0.002746	0.170092	0.011895	-0.074583	0.051157	-0.029075	-0.023004	0.070420	0.165893	0.045131	0.179958	0.119888	0.116236	0.025882	-0.040671	0.092886	-0.093597	0.047767	0.043569	-0.015040	-0.044558	0.030819	-0.001442	0.061336	-0.061068	0.025783	0.030147	-0.007183	1	1	1	0	1	1	1
2	225.0	5.79	1	1	1	1	0	0	0.803688	-0.764302	-0.712482	-0.142711	0.702741	-0.562767	0.513145	0.434236	1.110108	-0.440919	0.416820	0.375817	0.131907	-0.247215	0.344559	-0.291005	-0.165455	0.716817	-0.282792	-0.336283	-0.066810	0.058990	0.224290	-0.224325	0.000569	0.005608	-0.239348	0.112613	-0.257442	-0.299611	-0.479394	0.412209	0.114827	0.018720	-0.037424	0.092506	0.045463	0.147886	0.418028	-0.122592	-0.118099	0.164522	-0.220075	0.082377	0.155993	-0.110947	0.074377	0.048933	0.146834	-0.026115	-0.032453	0.474082	0.364122	0.407795	-0.177835	0.311574	0.583603	0.245423	0.136955	-0.394393	-0.089323	-0.086022	-0.436033	0.245131	-0.159452	-0.161680	0.139590	-0.291059	-0.135516	0.339074	0.092451	-0.029702	-0.034609	0.332388	-0.078524	-0.018802	-0.144803	-0.051464	0.037284	0.231912	0.219138	-0.171241	0.210746	-0.400567	0.485344	-0.331857	-0.006506	0.140256	0.141687	0.204173	0.388371	0.058433	0.051556	-0.157573	0.356370	0.186920	0.037566	0.431225	0.196193	0.075336	0.622562	0.150917	0.049859	-0.309979	0.174709	0.182294	-0.337023	-0.329030	-0.058266	0.312668	0.171151	-0.238385	-0.277395	0.154660	0.607164	-0.176054	-0.029586	0.330116	-0.625818	0.595745	-0.224860	0.183875	0.047668	0.258280	-0.589725	-0.330339	-0.076033	0.212671	0.049748	-0.225850	-0.246723	-0.106169	0.296239	-0.081555	-0.316288	-0.176459	-0.030233	-0.013972	-0.189801	0.204183	0.196081	-0.133140	-0.098637	0.184809	-0.158221	-0.105178	-0.010265	0.143827	-0.111082	0.010023	-0.070022	-0.271469	-0.048957	0.208974	-0.022513	0.057774	-0.098277	0.019785	-0.054795	-0.041007	0.030711	-0.165435	0.047178	0.208223	-0.075813	-0.189821	-0.161922	-0.224340	-0.215266	0.168054	0.021854	0.152516	-0.134594	-0.042823	0.096735	0.069236	-0.258705	-0.039021	0.028002	-0.085916	-0.105588	0.108640	0.099645	-0.069780	0.024125	0.039725	0.083296	-0.066083	-0.130674	-0.080949	0.102462	-0.101462	0.031650	0.094067	0.087863	0.222102	-0.181195	-0.120464	-0.253469	0.138050	-0.057596	-0.113854	0.309566	0.040317	-0.080729	-0.149938	0.221023	0.285281	
-0.159688	-0.130253	-0.076187	-0.183568	-0.352553	-0.041988	0.292411	-0.159592	0.169861	0.163765	0.438876	-0.107830	-0.200781	0.079348	-0.180177	-0.031166	0.134480	-0.116840	-0.178634	-0.049329	0.050786	-0.036438	-0.089431	-0.067511	0.011603	-0.057506	-0.060196	0.002193	-0.014653	0.018034	0	0	0	0	1	1	1
3	16.0	5.79	1	1	1	1	0	0	0.929491	-0.932884	-0.726056	-0.114928	0.522740	-0.842807	0.088108	-0.060701	0.774992	-0.580296	0.279064	-0.105791	0.090236	-0.062781	-0.042130	-0.646590	0.286500	0.677719	-0.147090	-0.607709	-0.155483	-0.242816	0.540126	-0.168444	0.004474	0.120198	-0.226751	-0.051746	-0.326657	-0.151226	-0.290903	0.380676	0.479114	-0.127506	0.109911	-0.112562	0.113159	0.254298	0.505504	0.095957	0.006108	0.051951	-0.279050	0.089623	-0.136662	-0.127206	-0.204727	0.130743	0.196108	-0.118974	0.154051	0.366491	0.139337	-0.264714	-0.002097	0.026756	-0.092753	0.165461	0.122922	-0.114801	-0.255742	-0.007287	0.040428	0.111094	-0.127278	0.203404	-0.121484	-0.360422	0.007067	-0.168739	0.177675	0.144146	0.089702	0.159066	-0.018784	0.006026	-0.024198	-0.097841	0.115738	-0.132244	0.030508	0.075870	0.038705	-0.103404	-0.010762	0.074832	0.017567	-0.143105	0.204047	-0.037974	0.382128	0.219198	-0.133037	0.108723	0.076086	0.550112	-0.114863	-0.555480	0.314994	0.283080	-0.061591	0.049609	-0.198549	-0.269926	0.294977	0.122101	0.074594	0.209514	-0.074885	0.100454	-0.039714	-0.049927	0.133002	0.145055	0.225195	0.081718	0.017266	0.027202	-0.115488	0.114858	-0.019739	0.207222	-0.057635	0.103169	-0.168163	-0.174054	-0.082340	0.058241	0.031017	0.202666	0.294572	-0.071566	-0.247586	0.215816	-0.110945	-0.247787	-0.062410	-0.073789	-0.337171	-0.244705	0.124334	-0.174044	0.090301	-0.304759	-0.057585	-0.039488	0.036029	-0.010787	-0.046020	-0.039824	-0.136864	0.006366	0.015453	0.105975	-0.019245	0.073369	-0.301700	0.308045	-0.016559	0.224531	0.004264	-0.492750	0.278378	0.474025	-0.195136	-0.025745	-0.686718	-0.174993	0.080452	0.159062	0.006103	-0.051750	-0.026966	0.052650	-0.040076	0.079832	-0.052376	-0.055806	0.058457	0.065146	-0.096804	-0.023083	0.081917	-0.106719	0.014766	0.046211	0.085984	-0.018033	-0.112645	0.020637	-0.024384	0.050251	0.006728	0.105667	0.076595	0.171997	-0.317151	-0.216691	-0.235013	0.175122	-0.161660	-0.247641	0.209097	0.070422	-0.047798	-0.132193	0.318713	0.262168	
-0.142461	-0.108562	-0.055686	0.253999	0.077172	-0.033976	0.153785	0.070240	-0.017721	0.142925	0.156166	0.217282	0.735767	0.239146	0.161499	0.041057	-0.108523	0.162034	-0.113066	0.063340	0.125382	0.052405	-0.023821	0.057092	0.020841	0.081334	-0.024063	0.068355	-0.003734	-0.024518	0	0	0	0	1	1	1
4	136.0	10.49	1	0	0	0	1	1	-0.270744	0.983164	0.345343	0.082186	-0.110755	-0.321691	0.829532	-0.277355	0.007374	0.067947	-0.415692	0.200043	0.014283	-0.210471	-0.731811	0.662589	-0.022502	-0.070575	0.245419	-0.547920	0.798233	-0.367135	-0.051795	-0.228293	0.048638	-0.170178	-0.147165	0.559245	0.290505	0.012682	0.367193	0.412463	-0.002623	0.451457	0.296433	-0.390191	0.236318	0.124057	-0.037063	-0.383591	0.556176	-0.070121	-0.199525	-0.115132	-0.090444	-0.061408	0.668433	-0.248972	0.267453	0.541314	0.177020	-0.150705	0.087225	-0.259116	-0.066941	-0.091808	0.027158	0.368062	0.151258	0.233684	0.174814	-0.048585	0.257596	0.059809	-0.178357	-0.016349	0.270663	-0.263091	0.235821	0.148781	0.207593	-0.270791	0.377389	-0.086384	-0.098895	0.379893	-0.272484	0.110565	-0.139379	0.184985	0.145482	-0.033085	-0.031522	-0.155692	-0.117188	-0.136447	0.131880	-0.100473	-0.140170	-0.192353	0.029345	-0.085022	0.101250	0.038953	-0.021798	0.240675	0.057040	0.118590	-0.299021	-0.018166	0.233100	0.416643	0.422783	0.081388	-0.255402	-0.320800	0.071673	-0.229848	-0.244175	-0.216232	-0.185579	-0.059756	-0.010926	0.164452	-0.105990	0.187536	-0.095631	-0.232195	0.277726	0.242024	0.415438	-0.346141	0.224576	0.185666	-0.279517	0.336853	0.500855	-0.200641	0.178764	0.408842	0.080450	0.206032	0.143937	-0.198519	0.356483	0.008578	0.286385	-0.150328	0.081217	0.162339	-0.048368	0.035324	0.160576	0.013535	-0.021063	-0.092352	-0.082244	0.098891	0.098428	-0.004358	0.114533	0.211289	0.295634	-0.084526	-0.053110	0.164938	0.090579	0.052545	-0.059209	0.113390	-0.142641	-0.128680	-0.025106	0.285039	0.025739	-0.012384	0.061502	-0.237029	0.381035	0.451849	-0.029450	-0.108372	-0.033305	-0.139967	0.275758	-0.598785	0.074246	-0.077904	-0.074210	0.290168	-0.211647	-0.131071	0.545578	-0.009969	0.030380	-0.177150	-0.243583	-0.093152	0.404141	0.136681	0.046191	0.099006	-0.129073	-0.067438	0.050372	0.061452	0.016641	0.029710	-0.014867	0.011403	0.078500	-0.102588	0.085155	0.014001	0.076067	0.236509	0.002467	0.012148	
0.193899	-0.052725	-0.180092	0.135548	-0.062892	-0.151938	0.240784	0.190276	0.049228	0.192550	-0.107496	0.005394	-0.178423	-0.281391	0.134556	0.067239	-0.605912	0.052148	-0.041095	0.188161	0.462584	-0.067987	-0.153376	0.059805	-0.254568	0.153198	-0.199089	0.094935	-0.468095	-0.160779	0	0	0	0	0	0	0

	Cluster	Percentage_Positive_Reviews	Total_User_Reviews
0	1	86.0	235.0
1	3	90.0	85.0
2	1	85.0	225.0
3	0	81.0	16.0
4	2	84.0	136.0

Data Understanding and Preparation

Price of games

Transformation (Game Features)

PCA Transformation (Popular Tags)

Finish cleaning up the dataset

Modelling and Evaluation

Clustering for further analysis

	1980s	1990's	2.5D	2D	2D Fighter	2D Platformer	360 Video	3D	3D Fighter	3D Platformer	...	Well-Written	Werewolves	Western	Wholesome	Word Game	World War I	World War II	Wrestling	Zombies	eSports
0	0	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	1	1	1	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
43298	0	0	0	1	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
43299	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
43300	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
43301	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
43302	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	0	0	0	0

	Cluster	Median	Mean	Std
0	0	75.0	72.449457	19.681472
1	1	84.0	80.881622	14.790937
2	2	83.0	79.108353	15.834017
3	3	80.0	76.671369	16.911836