# Imports
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
from statistics import mean, median
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr


# Increase the max number of columns and rows which can be displayed.
# The fully qualified "display.*" keys are used on purpose: on pandas >= 1.4
# the bare "max_columns"/"max_rows" patterns also match the
# "styler.render.*" options, and set_option raises OptionError for an
# ambiguous pattern.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
# Read the two raw CSV files into pandas DataFrames
games_df = pd.read_csv("steam_games.csv")
requirements_df = pd.read_csv("steam_requirements_data.csv")

# Preview the first ten rows of each frame under its caption
for caption, frame in (
    ("Steam games data:", games_df),
    ("Steam game requirements data:", requirements_df),
):
    print(caption)
    display(frame.head(10))

# Show how often each required_age value occurs
age_counts = games_df["required_age"].value_counts()
print("Value counts of required_age")
print(age_counts)

Steam games data:

Steam game requirements data:

Value counts of required_age
0     26479
18      308
16      192
12       73
7        12
3        11
Name: required_age, dtype: int64


def transform_owners(x):
    """Collapse an owner range such as "5000000-10000000" into its midpoint.

    The first two "-"-separated fields of ``x`` are parsed as integers and
    the truncated average of the two is returned as an int.
    """
    bounds = x.split("-")
    lower, upper = int(bounds[0]), int(bounds[1])
    return int((lower + upper) / 2)
# Integer amount directly followed by "mbram"/"gbram" in a space-free, lowercased string
_RAM_PATTERN = re.compile(r'([0-9]+)(mb|gb)ram')
# A "<digits>." run that ends exactly where the matched amount starts, i.e. the
# whole part of a decimal amount such as the "1." in "1.5 gb ram". The
# lookbehind rejects dotted sequences like "8.1.2".
_DECIMAL_PREFIX = re.compile(r'(?<![0-9.])([0-9]+)\.\Z')


def transform_reqs(reqs):
    """Extract the RAM amount, in GB, from a free-text requirements string.

    The text is lowercased and searched, ignoring spaces, for the first
    ``<number>mb ram`` / ``<number>gb ram`` occurrence (e.g. "96mb ram",
    "4 GB RAM"). Megabytes are converted with 1 GB = 1000 MB.

    Parameters:
        reqs: the requirements text, or a missing value (NaN / None).

    Returns:
        The RAM amount in GB (int for whole-GB amounts, float otherwise),
        or ``np.nan`` when ``reqs`` is not a string or holds no RAM amount.
    """
    # Anything that is not text (NaN from pandas, None, ...) has no RAM info.
    # A type check is used rather than an identity check against np.nan,
    # which only recognises that one specific NaN object.
    if not isinstance(reqs, str):
        return np.nan

    lowered = reqs.lower()
    # Positions of the non-space characters, so a match found in the
    # space-free text can be mapped back to the original spacing
    kept = [i for i, ch in enumerate(lowered) if ch != " "]
    stripped = "".join(lowered[i] for i in kept)

    match = _RAM_PATTERN.search(stripped)
    if match is None:
        return np.nan

    digits, unit = match.groups()
    # If the digits are glued to a preceding "<digits>." in the original text,
    # they are the fractional part of a decimal amount ("1.5 GB RAM" is 1.5,
    # not 5). Spacing matters here: "Windows 8.1. 2 GB RAM" stays 2.
    prefix = lowered[:kept[match.start()]]
    whole_part = _DECIMAL_PREFIX.search(prefix)
    if whole_part is not None:
        amount = float(whole_part.group(1) + "." + digits)
    else:
        amount = int(digits)

    # Convert to gb
    if unit == "mb":
        return amount / 1000
    return amount

# Keep only the four-character year prefix of the release date, as an int
games_df["release_date"] = games_df["release_date"].apply(lambda date_text: int(date_text[:4]))
# Replace each owner range with the midpoint computed by transform_owners
games_df["owners"] = games_df["owners"].apply(transform_owners)
# Share of positive ratings among all ratings
total_ratings = games_df["positive_ratings"] + games_df["negative_ratings"]
games_df["rating"] = games_df["positive_ratings"] / total_ratings
# Join the two dataframes: appid in games_df against steam_appid in requirements_df
requirements_by_appid = requirements_df.set_index("steam_appid")
games_df = games_df.join(requirements_by_appid, on=["appid"])
# Drop the unneeded columns: first those of the games data, then those
# brought in by joining the 2 dataframes
unneeded_columns = [
    'english', 'achievements', 'required_age', 'positive_ratings', 'negative_ratings', 'steamspy_tags',
    'pc_requirements', 'mac_requirements', 'linux_requirements', 'recommended',
]
games_df = games_df.drop(columns=unneeded_columns)
# Parse the minimum-requirements text with transform_reqs, then
# rename "minimum" to "min_req_ram"
games_df["minimum"] = games_df["minimum"].apply(transform_reqs)
games_df = games_df.rename(columns={'minimum': 'min_req_ram'})
# Estimated revenue = owners * price
games_df["est_revenue"] = games_df["owners"] * games_df["price"]
display(games_df.head(10))


# A helper function used to see the unique elements over an entire column in games_df.
# Each element of the column should be a list e.g., ['Action', 'Indie']
def unique_col_elts(column_name, df=None):
    """Print the distinct list entries of a column, in first-seen order.

    Parameters:
        column_name: name of the column to inspect; every cell of that
            column must be an iterable of entries (e.g. a list of strings).
        df: optional frame (or any mapping from column name to an iterable
            of cells) to inspect. Defaults to the module-level ``games_df``.

    Returns:
        None. The unique entries and their count are printed.
    """
    # Look up the global only when no frame was supplied explicitly
    frame = games_df if df is None else df
    unique_entries = []

    for cell in frame[column_name]:
        for entry in cell:
            if entry not in unique_entries:
                unique_entries.append(entry)

    print("Unique entries:")
    print(unique_entries)
    print("Length:")
    print(len(unique_entries))

# Each of the three sections below splits a ';'-separated column into lists
# and one-hot encodes it. fit_transform() both learns the label set and
# encodes the column, so no separate fit() pass is run beforehand.

# Turn platforms column into a list splitting on ';' in order to do one-hot encoding
games_df["platforms"] = games_df["platforms"].apply(lambda x: x.split(';'))
# One-hot encode platforms
mlb = MultiLabelBinarizer()
encoded_platforms = mlb.fit_transform(games_df['platforms'])
new_col_names = [f"platform_{c}" for c in mlb.classes_]
# Create new DataFrame with one-hot encoded platforms
platforms = pd.DataFrame(encoded_platforms, columns=new_col_names, index=games_df.index)
# Join platforms into games_df
games_df = games_df.join(platforms)

# Turn genres column into a list splitting on ';' in order to do one-hot encoding
games_df["genres"] = games_df["genres"].apply(lambda x: x.split(';'))
# One-hot encode genres
mlb = MultiLabelBinarizer()
encoded_genres = mlb.fit_transform(games_df['genres'])
new_col_names = [f"genre_{c}" for c in mlb.classes_]
# Create new DataFrame with one-hot encoded genres
genres = pd.DataFrame(encoded_genres, columns=new_col_names, index=games_df.index)
# Join genres into games_df
games_df = games_df.join(genres)

# Turn categories column into a list splitting on ';' in order to do one-hot encoding
games_df["categories"] = games_df["categories"].apply(lambda x: x.split(';'))
# One-hot encode categories
mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(games_df['categories'])
new_col_names = [f"category_{c}" for c in mlb.classes_]
# Create new DataFrame with one-hot encoded categories
categories = pd.DataFrame(encoded_categories, columns=new_col_names, index=games_df.index)
# Join categories into games_df
games_df = games_df.join(categories)

display(games_df.head(10))
print("Dimensionality of DataFrame:")
print(games_df.shape)

Dimensionality of DataFrame:
(27075, 76)


# Print every distinct entry of the genres column together with their count
unique_col_elts('genres')

Unique entries:
['Action', 'Free to Play', 'Strategy', 'Adventure', 'Indie', 'RPG', 'Animation & Modeling', 'Video Production', 'Casual', 'Simulation', 'Racing', 'Violent', 'Massively Multiplayer', 'Nudity', 'Sports', 'Early Access', 'Gore', 'Utilities', 'Design & Illustration', 'Web Publishing', 'Education', 'Software Training', 'Sexual Content', 'Audio Production', 'Game Development', 'Photo Editing', 'Accounting', 'Documentary', 'Tutorial']
Length:
29


# Scatter each game's estimated revenue against its appid
plt.figure(figsize=(5,5))
plt.scatter(games_df['appid'], games_df['est_revenue'])
plt.title("Revenue Per Game", fontsize=16)
plt.xlabel("appid", fontsize=16)
plt.ylabel("Revenue (in GBP)", fontsize=16)
plt.show()


genres = [
    'Action', 'Free to Play', 'Strategy', 'Adventure', 'Indie', 'RPG', 'Casual', 'Simulation', 'Racing', 'Violent',
    'Massively Multiplayer', 'Nudity', 'Sports', 'Early Access', 'Gore', 'Education', 'Sexual Content',
]

# ---- Average revenue per genre ----
genre_to_rev = {}
for genre in genres:
    # Games flagged with this genre in the one-hot column
    genre_games = games_df.loc[games_df[f"genre_{genre}"] == 1]
    # There is only one game with a revenue of more than 0.5*10^9 (has value 2*10^9);
    # leave that outlier out of the statistic.
    outlier_rows = genre_games[genre_games["est_revenue"] > 0.5*10**9].index
    genre_games = genre_games.drop(outlier_rows)
    # Two decimals keep the bar labels short
    genre_to_rev[genre] = round(mean(genre_games['est_revenue']), 2)

plt.figure(figsize=(30,10))
barplot = plt.bar(genre_to_rev.keys(), genre_to_rev.values(), color='green')
plt.bar_label(barplot, labels=genre_to_rev.values(), fontsize=14)
plt.title("Average Revenue Per Genre", fontsize=16)
plt.xlabel("Genre", fontsize=16)
plt.ylabel("Average Revenue (in GBP)", fontsize=16)
plt.show()

# ---- Median revenue per genre ----
genre_to_rev = {}
for genre in genres:
    # Same selection and outlier removal as above, aggregated with the median
    genre_games = games_df.loc[games_df[f"genre_{genre}"] == 1]
    outlier_rows = genre_games[genre_games["est_revenue"] > 0.5*10**9].index
    genre_games = genre_games.drop(outlier_rows)
    genre_to_rev[genre] = round(median(genre_games['est_revenue']), 2)

plt.figure(figsize=(30,10))
barplot = plt.bar(genre_to_rev.keys(), genre_to_rev.values(), color='green')
plt.bar_label(barplot, labels=genre_to_rev.values(), fontsize=14)
plt.title("Median Revenue Per Genre", fontsize=16)
plt.xlabel("Genre", fontsize=16)
plt.ylabel("Median Revenue (in GBP)", fontsize=16)
plt.show()


# Count games per genre by summing the 0/1 one-hot columns
education_count = sum(games_df['genre_Education'])
mmo_count = sum(games_df["genre_Massively Multiplayer"])
# Massively Multiplayer games whose price is exactly 0
free_mmo_games = games_df.loc[(games_df["genre_Massively Multiplayer"] == 1) & (games_df["price"] == 0)]

print("The number of Education games:")
print(education_count)
print("The number of Massively Multiplayer games:")
print(mmo_count)
print("The number of Free Massively Multiplayer games:")
print(len(free_mmo_games))

The number of Education games:
51
The number of Massively Multiplayer games:
723
The number of Free Massively Multiplayer games:
375


genres = [
    'Action', 'Free to Play', 'Strategy', 'Adventure', 'Indie', 'RPG', 'Casual', 'Simulation', 'Racing', 'Violent',
    'Massively Multiplayer', 'Nudity', 'Sports', 'Early Access', 'Gore', 'Education', 'Sexual Content',
]

# Mean of average_playtime over the games of each genre
genre_to_playtime = {}
for genre in genres:
    genre_games = games_df.loc[games_df[f"genre_{genre}"] == 1]
    # Two decimals keep the bar labels short
    genre_to_playtime[genre] = round(mean(genre_games['average_playtime']), 2)

plt.figure(figsize=(30,10))
barplot = plt.bar(genre_to_playtime.keys(), genre_to_playtime.values(), color='teal')
plt.bar_label(barplot, labels=genre_to_playtime.values(), fontsize=14)
plt.title("Average Playtime Per Genre (in Minutes)", fontsize=16)
plt.xlabel("Genre", fontsize=16)
plt.ylabel("Average Playtime (in Minutes)", fontsize=16)
plt.show()


# Average playtime per genre, restricted to games tagged "Free to Play".
# Relies on the `genres` list defined by an earlier cell.
plt.figure(figsize=(30,10))
plt.xlabel("Free Genres", fontsize=16)
plt.ylabel("Average Playtime (in Minutes)", fontsize=16)
plt.title("Average Playtime Per Free Genre (in Minutes)", fontsize=16)

free_games = games_df.loc[games_df["genre_Free to Play"] == 1]
genre_to_playtime = {}
for genre in genres:
    # Filter the free games to just the rows/games which are the genre 'genre'.
    # The mask is built from free_games itself: a mask taken from the full
    # games_df only works through implicit index alignment and raises an
    # IndexingError as soon as the two indexes stop lining up.
    free_genre_games = free_games.loc[free_games["genre_" + genre] == 1]
    # Round to avoid long chart labels
    genre_to_playtime.update({genre : round(mean(free_genre_games['average_playtime']), 2)})

barplot = plt.bar(genre_to_playtime.keys(), genre_to_playtime.values(), color='teal')
plt.bar_label(barplot, labels=genre_to_playtime.values(), fontsize=14)
plt.show()


genres = [
    'Action', 'Free to Play', 'Strategy', 'Adventure', 'Indie', 'RPG', 'Casual', 'Simulation', 'Racing', 'Violent',
    'Massively Multiplayer', 'Nudity', 'Sports', 'Early Access', 'Gore', 'Education', 'Sexual Content',
]

# Mean rating over the games of each genre
genre_to_rating = {}
for genre in genres:
    genre_games = games_df.loc[games_df[f"genre_{genre}"] == 1]
    # Four decimals keep the bar labels short
    genre_to_rating[genre] = round(mean(genre_games['rating']), 4)

# Order the genres from best to worst average rating
ranked_items = sorted(genre_to_rating.items(), key=lambda item: item[1], reverse=True)
sorted_dict = dict(ranked_items)

plt.figure(figsize=(30,10))
barplot = plt.bar(sorted_dict.keys(), sorted_dict.values(), color='gold')
plt.bar_label(barplot, labels=sorted_dict.values(), fontsize=14)
plt.title("Average Rating Per Genre", fontsize=16)
plt.xlabel("Genre", fontsize=16)
plt.ylabel("Average Rating", fontsize=16)
# Fix the y axis to the range 0..1
plt.ylim([0, 1])
plt.show()


# unique_col_elts prints its findings itself and has no return value, so it
# is called directly; wrapping it in print() only emitted a stray "None".
unique_col_elts('categories')

# Categories of interest, used with the "category_<name>" one-hot columns
categories = ['Multi-player', 'Online Multi-Player', 'Local Multi-Player', 'Single-player', 'Partial Controller Support',
'Cross-Platform Multiplayer', 'Includes level editor', 'In-App Purchases', 'Co-op', 'Full controller support',
'Online Co-op', 'Shared/Split Screen', 'Local Co-op', 'MMO', 'VR Support', 'Mods', 'Mods (require HL2)']
# Report how many games carry each category
for cat in categories:
    print("Num entries of category " + cat + ":", len(games_df.loc[games_df["category_" + cat] == 1]))

Unique entries:
['Multi-player', 'Online Multi-Player', 'Local Multi-Player', 'Valve Anti-Cheat enabled', 'Single-player', 'Steam Cloud', 'Steam Achievements', 'Steam Trading Cards', 'Captions available', 'Partial Controller Support', 'Includes Source SDK', 'Cross-Platform Multiplayer', 'Stats', 'Commentary available', 'Includes level editor', 'Steam Workshop', 'In-App Purchases', 'Co-op', 'Full controller support', 'Steam Leaderboards', 'SteamVR Collectibles', 'Online Co-op', 'Shared/Split Screen', 'Local Co-op', 'MMO', 'VR Support', 'Mods', 'Mods (require HL2)', 'Steam Turn Notifications']
Length:
29
None
Num entries of category Multi-player: 3974
Num entries of category Online Multi-Player: 2487
Num entries of category Local Multi-Player: 1615
Num entries of category Single-player: 25678
Num entries of category Partial Controller Support: 4234
Num entries of category Cross-Platform Multiplayer: 1081
Num entries of category Includes level editor: 1036
Num entries of category In-App Purchases: 690
Num entries of category Co-op: 1721
Num entries of category Full controller support: 5695
Num entries of category Online Co-op: 1071
Num entries of category Shared/Split Screen: 2152
Num entries of category Local Co-op: 1059
Num entries of category MMO: 421
Num entries of category VR Support: 231
Num entries of category Mods: 2
Num entries of category Mods (require HL2): 1


categories = [
    'Multi-player', 'Online Multi-Player', 'Local Multi-Player', 'Single-player', 'Partial Controller Support',
    'Cross-Platform Multiplayer', 'Includes level editor', 'In-App Purchases', 'Co-op', 'Full controller support',
    'Online Co-op', 'Shared/Split Screen', 'Local Co-op', 'MMO', 'VR Support',
]

# ---- Average revenue per category ----
cat_to_rev = {}
for cat in categories:
    # Games flagged with this category in the one-hot column
    cat_games = games_df.loc[games_df[f"category_{cat}"] == 1]
    # There is only one game with a revenue of more than 0.5*10^9 (has value 2*10^9);
    # leave that outlier out of the statistic.
    outlier_rows = cat_games[cat_games["est_revenue"] > 0.5*10**9].index
    cat_games = cat_games.drop(outlier_rows)
    # Two decimals keep the bar labels short
    cat_to_rev[cat] = round(mean(cat_games['est_revenue']), 2)

plt.figure(figsize=(38,10))
barplot = plt.bar(cat_to_rev.keys(), cat_to_rev.values(), color='green')
plt.bar_label(barplot, labels=cat_to_rev.values(), fontsize=14)
plt.title("Average Revenue Per Category", fontsize=16)
plt.xlabel("Category", fontsize=16)
plt.ylabel("Average Revenue", fontsize=16)
plt.show()

# ---- Median revenue per category ----
cat_to_rev = {}
for cat in categories:
    # Same selection and outlier removal as above, aggregated with the median
    cat_games = games_df.loc[games_df[f"category_{cat}"] == 1]
    outlier_rows = cat_games[cat_games["est_revenue"] > 0.5*10**9].index
    cat_games = cat_games.drop(outlier_rows)
    cat_to_rev[cat] = round(median(cat_games['est_revenue']), 2)

plt.figure(figsize=(38,10))
barplot = plt.bar(cat_to_rev.keys(), cat_to_rev.values(), color='green')
plt.bar_label(barplot, labels=cat_to_rev.values(), fontsize=14)
plt.title("Median Revenue Per Category", fontsize=16)
plt.xlabel("Category", fontsize=16)
plt.ylabel("Median Revenue", fontsize=16)
plt.show()

# Number of games carrying the 'Includes level editor' category
level_editor_games = games_df.loc[games_df["category_Includes level editor"] == 1]
print("Number of 'Includes level editor' games: ", len(level_editor_games))

Number of 'Includes level editor' games:  1036


# Mean of average_playtime over the games of each category.
# Relies on the `categories` list defined by an earlier cell.
cat_to_playtime = {}
for cat in categories:
    cat_games = games_df.loc[games_df[f"category_{cat}"] == 1]
    # Two decimals keep the bar labels short
    cat_to_playtime[cat] = round(mean(cat_games['average_playtime']), 2)

plt.figure(figsize=(38,10))
barplot = plt.bar(cat_to_playtime.keys(), cat_to_playtime.values(), color='teal')
plt.bar_label(barplot, labels=cat_to_playtime.values(), fontsize=14)
plt.title("Average Playtime Per Category", fontsize=16)
plt.xlabel("Category", fontsize=16)
plt.ylabel("Average Playtime (in Minutes)", fontsize=16)
plt.show()


# Mean rating over the games of each category.
# Relies on the `categories` list defined by an earlier cell.
cat_to_rating = {}
for cat in categories:
    cat_games = games_df.loc[games_df[f"category_{cat}"] == 1]
    # Four decimals keep the bar labels short
    cat_to_rating[cat] = round(mean(cat_games['rating']), 4)

# Order the categories from best to worst average rating
ranked_items = sorted(cat_to_rating.items(), key=lambda item: item[1], reverse=True)
sorted_dict = dict(ranked_items)

plt.figure(figsize=(38,10))
barplot = plt.bar(sorted_dict.keys(), sorted_dict.values(), color='gold')
plt.bar_label(barplot, labels=sorted_dict.values(), fontsize=14)
plt.title("Average Rating Per Category", fontsize=16)
plt.xlabel("Category", fontsize=16)
plt.ylabel("Average Rating", fontsize=16)
# Fix the y axis to the range 0..1
plt.ylim([0, 1])
plt.show()


# Leave out the single game whose estimated revenue exceeds 0.5*10^9
revenue_outliers = games_df[games_df["est_revenue"] > 0.5*10**9].index
filtered_df = games_df.drop(revenue_outliers)
# Leave out games priced above 100: there are only a few and they make the graph unreadable
overpriced = filtered_df[filtered_df["price"] > 100].index
filtered_df = filtered_df.drop(overpriced)

# Fit a linear regression of revenue on price, using every remaining game as training data
x_train = np.array(filtered_df["price"]).reshape(-1, 1)
y_train = np.array(filtered_df["est_revenue"])
linreg = LinearRegression()
lin_model = linreg.fit(x_train, y_train)

# Scatter plot of est_revenue vs price with the fitted line on top
plt.figure(figsize=(10,10))
plt.scatter(filtered_df['price'], filtered_df['est_revenue'])
plt.plot(filtered_df["price"], lin_model.predict(x_train), color='orange')
plt.title("Revenue of Game vs Price", fontsize=12)
plt.xlabel("Price (in GBP)", fontsize=12)
plt.ylabel("Revenue (in GBP)", fontsize=12)
plt.show()

print("Slope of line:")
print(linreg.coef_)
# R^2 of the fitted line on the data it was trained on
r_squared = lin_model.score(x_train, y_train)
print("R Squared: ", r_squared)
# Spearman rank correlation between price and revenue
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[335584.10169458]
R Squared:  0.07756478770143238
Spearman Rank Correlation: 0.8367535231200439
p-value: 0.0


# A few games report an average playtime of over 50,000 minutes; these are
# extreme outliers (and may even be errors), so leave them out.
playtime_outliers = games_df[games_df["average_playtime"] > 50_000].index
filtered_df = games_df.drop(playtime_outliers)
# Leave out games priced above 100: there are only a few and they make the graph unreadable
overpriced = filtered_df[filtered_df["price"] > 100].index
filtered_df = filtered_df.drop(overpriced)

# Fit a linear regression of playtime on price, using every remaining game as training data
x_train = np.array(filtered_df["price"]).reshape(-1, 1)
y_train = np.array(filtered_df["average_playtime"])
linreg = LinearRegression()
lin_model = linreg.fit(x_train, y_train)

# Scatter plot of average_playtime vs price with the fitted line on top
plt.figure(figsize=(10,10))
plt.scatter(filtered_df['price'], filtered_df['average_playtime'])
plt.plot(filtered_df["price"], lin_model.predict(x_train), color='orange')
plt.title("Playtime of Game vs Price", fontsize=12)
plt.xlabel("Price (in GBP)", fontsize=12)
plt.ylabel("Playtime (in minutes)", fontsize=12)
plt.show()

print("Slope of line:")
print(linreg.coef_)
# R^2 of the fitted line on the data it was trained on
r_squared = lin_model.score(x_train, y_train)
print("R Squared: ", r_squared)
# Spearman rank correlation between price and playtime
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[17.21861957]
R Squared:  0.013777200440742932
Spearman Rank Correlation: 0.08010711260585777
p-value: 9.048357111488325e-40


# Drop those with a price > 100 as there are only a few games with that price and it makes our graph unreadable.
# Start from games_df rather than from whatever filtered_df an earlier cell
# left behind: reusing the leftover frame silently carried over that cell's
# filters and made the result depend on cell execution order.
filtered_df = games_df.drop(games_df[games_df["price"] > 100].index)
# Set up graph
plt.figure(figsize=(10,10))
plt.xlabel("Price (in GBP)", fontsize=12)
plt.ylabel("Rating", fontsize=12)
plt.title("Rating of Game vs Price", fontsize=12)
# Scatter plot of rating vs price
plt.scatter(filtered_df['price'], filtered_df['rating'])
# Now we fit a linear regression line to the graph
linreg = LinearRegression()
# The training values we use are the entire set of prices and ratings
x_train = np.array(filtered_df["price"]).reshape(-1, 1)
y_train = np.array(filtered_df["rating"])
lin_model = linreg.fit(x_train, y_train)
# Predict y (rating) from x (price)
plt.plot(filtered_df["price"], lin_model.predict(x_train), color='orange')
plt.show()
print("Slope of line:")
print(linreg.coef_)
# Evaluate how accurate our linear model is (R^2 on the training data)
print("R Squared: ", lin_model.score(x_train, y_train))
# Evaluate a relationship between x and y
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[0.00306605]
R Squared:  0.007689622315858458
Spearman Rank Correlation: 0.12216057444578153
p-value: 1.8246289754049238e-90


# Leave out the single game whose estimated revenue exceeds 0.5*10^9
revenue_outliers = games_df[games_df["est_revenue"] > 0.5*10**9].index
filtered_df = games_df.drop(revenue_outliers)
# Leave out games requiring more than 16 GB of RAM (only a few, and the
# extreme ones make the graph unreadable)
ram_outliers = filtered_df[filtered_df["min_req_ram"] > 16].index
filtered_df = filtered_df.drop(ram_outliers)
# Leave out games without a parsed RAM requirement (about 2.5k entries)
filtered_df = filtered_df.dropna(subset=['min_req_ram'])

# Fit a linear regression of revenue on RAM, using every remaining game as training data
x_train = np.array(filtered_df["min_req_ram"]).reshape(-1, 1)
y_train = np.array(filtered_df["est_revenue"])
linreg = LinearRegression()
lin_model = linreg.fit(x_train, y_train)

# Scatter plot of est_revenue vs min_req_ram with the fitted line on top
plt.figure(figsize=(10,10))
plt.scatter(filtered_df['min_req_ram'], filtered_df['est_revenue'])
plt.plot(filtered_df["min_req_ram"], lin_model.predict(x_train), color='orange')
plt.title("Revenue of Game vs RAM Requirement", fontsize=12)
plt.xlabel("RAM Requirement (in GB)", fontsize=12)
plt.ylabel("Revenue (in GBP)", fontsize=12)
plt.show()

print("Slope of line:")
print(linreg.coef_)
# R^2 of the fitted line on the data it was trained on
r_squared = lin_model.score(x_train, y_train)
print("R Squared: ", r_squared)
# Spearman rank correlation between RAM requirement and revenue
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[190191.19552378]
R Squared:  0.00289594821321415
Spearman Rank Correlation: 0.06251399865790182
p-value: 9.087453487640155e-23


# Leave out games requiring more than 16 GB of RAM (only a few, and the
# extreme ones make the graph unreadable)
ram_outliers = games_df[games_df["min_req_ram"] > 16].index
filtered_df = games_df.drop(ram_outliers)
# A few games report an average playtime of over 50,000 minutes; these are
# extreme outliers (and may even be errors), so leave them out.
playtime_outliers = filtered_df[filtered_df["average_playtime"] > 50_000].index
filtered_df = filtered_df.drop(playtime_outliers)
# Leave out games without a parsed RAM requirement (about 2.5k entries)
filtered_df = filtered_df.dropna(subset=['min_req_ram'])

# Fit a linear regression of playtime on RAM, using every remaining game as training data
x_train = np.array(filtered_df["min_req_ram"]).reshape(-1, 1)
y_train = np.array(filtered_df["average_playtime"])
linreg = LinearRegression()
lin_model = linreg.fit(x_train, y_train)

# Scatter plot of average_playtime vs min_req_ram with the fitted line on top
plt.figure(figsize=(10,10))
plt.scatter(filtered_df['min_req_ram'], filtered_df['average_playtime'])
plt.plot(filtered_df["min_req_ram"], lin_model.predict(x_train), color='orange')
plt.title("Playtime of Game vs RAM Requirement", fontsize=12)
plt.xlabel("RAM Requirement (in GB)", fontsize=12)
plt.ylabel("Playtime (in minutes)", fontsize=12)
plt.show()

print("Slope of line:")
print(linreg.coef_)
# R^2 of the fitted line on the data it was trained on
r_squared = lin_model.score(x_train, y_train)
print("R Squared: ", r_squared)
# Spearman rank correlation between RAM requirement and playtime
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[5.64626105]
R Squared:  0.00017148913340880867
Spearman Rank Correlation: -0.04926200841772169
p-value: 1.0265072179264815e-14


# Leave out games requiring more than 16 GB of RAM (only a few, and the
# extreme ones make the graph unreadable)
ram_outliers = games_df[games_df["min_req_ram"] > 16].index
filtered_df = games_df.drop(ram_outliers)
# Leave out games without a parsed RAM requirement (about 2.5k entries)
filtered_df = filtered_df.dropna(subset=['min_req_ram'])

# Fit a linear regression of rating on RAM, using every remaining game as training data
x_train = np.array(filtered_df["min_req_ram"]).reshape(-1, 1)
y_train = np.array(filtered_df["rating"])
linreg = LinearRegression()
lin_model = linreg.fit(x_train, y_train)

# Scatter plot of rating vs min_req_ram with the fitted line on top
plt.figure(figsize=(10,10))
plt.scatter(filtered_df['min_req_ram'], filtered_df['rating'])
plt.plot(filtered_df["min_req_ram"], lin_model.predict(x_train), color='orange')
plt.title("Rating of Game vs RAM Requirement", fontsize=12)
plt.xlabel("RAM Requirement (in GB)", fontsize=12)
plt.ylabel("Rating", fontsize=12)
plt.show()

print("Slope of line:")
print(linreg.coef_)
# R^2 of the fitted line on the data it was trained on
r_squared = lin_model.score(x_train, y_train)
print("R Squared: ", r_squared)
# Spearman rank correlation between RAM requirement and rating
corr, p_value = spearmanr(x_train, y_train)
print("Spearman Rank Correlation:", corr)
print("p-value:", p_value)

Slope of line:
[-0.00606724]
R Squared:  0.0033711174148746137
Spearman Rank Correlation: -0.05310791389951006
p-value: 7.322090104562239e-17

	appid	name	release_date	english	developer	publisher	platforms	categories	genres	steamspy_tags	achievements	positive_ratings	negative_ratings	average_playtime	median_playtime	owners	price
0	10	Counter-Strike	2000-11-01	1	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	Action;FPS;Multiplayer	0	124534	3339	17612	317	10000000-20000000	7.19
1	20	Team Fortress Classic	1999-04-01	1	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	Action;FPS;Multiplayer	0	3318	633	277	62	5000000-10000000	3.99
2	30	Day of Defeat	2003-05-01	1	Valve	Valve	windows;mac;linux	Multi-player;Valve Anti-Cheat enabled	Action	FPS;World War II;Multiplayer	0	3416	398	187	34	5000000-10000000	3.99
3	40	Deathmatch Classic	2001-06-01	1	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	Action;FPS;Multiplayer	0	1273	267	258	184	5000000-10000000	3.99
4	50	Half-Life: Opposing Force	1999-11-01	1	Gearbox Software	Valve	windows;mac;linux	Single-player;Multi-player;Valve Anti-Cheat en...	Action	FPS;Action;Sci-fi	0	5250	288	624	415	5000000-10000000	3.99
5	60	Ricochet	2000-11-01	1	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Valve Anti-Ch...	Action	Action;FPS;Multiplayer	0	2758	684	175	10	5000000-10000000	3.99
6	70	Half-Life	1998-11-08	1	Valve	Valve	windows;mac;linux	Single-player;Multi-player;Online Multi-Player...	Action	FPS;Classic;Action	0	27755	1100	1300	83	5000000-10000000	7.19
7	80	Counter-Strike: Condition Zero	2004-03-01	1	Valve	Valve	windows;mac;linux	Single-player;Multi-player;Valve Anti-Cheat en...	Action	Action;FPS;Multiplayer	0	12120	1439	427	43	10000000-20000000	7.19
8	130	Half-Life: Blue Shift	2001-06-01	1	Gearbox Software	Valve	windows;mac;linux	Single-player	Action	FPS;Action;Sci-fi	0	3822	420	361	205	5000000-10000000	3.99
9	220	Half-Life 2	2004-11-16	1	Valve	Valve	windows;mac;linux	Single-player;Steam Achievements;Steam Trading...	Action	FPS;Action;Sci-fi	33	67902	2419	691	402	10000000-20000000	7.19

	steam_appid	pc_requirements	mac_requirements	linux_requirements	minimum	recommended
0	10	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
1	20	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
2	30	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
3	40	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
4	50	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
5	60	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
6	70	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
7	80	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	[]	[]	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
8	130	{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...	{'minimum': 'Minimum: OS X Snow Leopard 10.6....	{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...	500 mhz processor, 96mb ram, 16mb video card, ...	NaN
9	220	{'minimum': '<strong>Minimum:</strong><br><ul ...	{'minimum': '<strong>Minimum:</strong><br><ul ...	[]	OS: Windows 7, Vista, XP Processor: 1.7 Ghz Me...	NaN

	appid	name	release_date	developer	publisher	platforms	categories	genres	average_playtime	median_playtime	owners	price	rating	min_req_ram	est_revenue
0	10	Counter-Strike	2000	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	17612	317	15000000	7.19	0.973888	0.096	107850000.0
1	20	Team Fortress Classic	1999	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	277	62	7500000	3.99	0.839787	0.096	29925000.0
2	30	Day of Defeat	2003	Valve	Valve	windows;mac;linux	Multi-player;Valve Anti-Cheat enabled	Action	187	34	7500000	3.99	0.895648	0.096	29925000.0
3	40	Deathmatch Classic	2001	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Local Multi-P...	Action	258	184	7500000	3.99	0.826623	0.096	29925000.0
4	50	Half-Life: Opposing Force	1999	Gearbox Software	Valve	windows;mac;linux	Single-player;Multi-player;Valve Anti-Cheat en...	Action	624	415	7500000	3.99	0.947996	0.096	29925000.0
5	60	Ricochet	2000	Valve	Valve	windows;mac;linux	Multi-player;Online Multi-Player;Valve Anti-Ch...	Action	175	10	7500000	3.99	0.801278	0.096	29925000.0
6	70	Half-Life	1998	Valve	Valve	windows;mac;linux	Single-player;Multi-player;Online Multi-Player...	Action	1300	83	7500000	7.19	0.961878	0.096	53925000.0
7	80	Counter-Strike: Condition Zero	2004	Valve	Valve	windows;mac;linux	Single-player;Multi-player;Valve Anti-Cheat en...	Action	427	43	15000000	7.19	0.893871	0.096	107850000.0
8	130	Half-Life: Blue Shift	2001	Gearbox Software	Valve	windows;mac;linux	Single-player	Action	361	205	7500000	3.99	0.900990	0.096	29925000.0
9	220	Half-Life 2	2004	Valve	Valve	windows;mac;linux	Single-player;Steam Achievements;Steam Trading...	Action	691	402	15000000	7.19	0.965601	0.512	107850000.0

	appid	name	release_date	developer	publisher	platforms	categories	genres	average_playtime	median_playtime	owners	price	rating	min_req_ram	est_revenue	platform_linux	platform_mac	platform_windows	genre_Action	category_Captions available	category_Includes Source SDK	category_Local Multi-Player	category_Multi-player	category_Online Multi-Player	category_Partial Controller Support	category_Single-player	category_Steam Achievements	category_Steam Cloud	category_Steam Trading Cards	category_Valve Anti-Cheat enabled
0	10	Counter-Strike	2000	Valve	Valve	[windows, mac, linux]	[Multi-player, Online Multi-Player, Local Mult...	[Action]	17612	317	15000000	7.19	0.973888	0.096	107850000.0	1	1	1	1	0	0	1	1	1	0	0	0	0	0	1
1	20	Team Fortress Classic	1999	Valve	Valve	[windows, mac, linux]	[Multi-player, Online Multi-Player, Local Mult...	[Action]	277	62	7500000	3.99	0.839787	0.096	29925000.0	1	1	1	1	0	0	1	1	1	0	0	0	0	0	1
2	30	Day of Defeat	2003	Valve	Valve	[windows, mac, linux]	[Multi-player, Valve Anti-Cheat enabled]	[Action]	187	34	7500000	3.99	0.895648	0.096	29925000.0	1	1	1	1	0	0	0	1	0	0	0	0	0	0	1
3	40	Deathmatch Classic	2001	Valve	Valve	[windows, mac, linux]	[Multi-player, Online Multi-Player, Local Mult...	[Action]	258	184	7500000	3.99	0.826623	0.096	29925000.0	1	1	1	1	0	0	1	1	1	0	0	0	0	0	1
4	50	Half-Life: Opposing Force	1999	Gearbox Software	Valve	[windows, mac, linux]	[Single-player, Multi-player, Valve Anti-Cheat...	[Action]	624	415	7500000	3.99	0.947996	0.096	29925000.0	1	1	1	1	0	0	0	1	0	0	1	0	0	0	1
5	60	Ricochet	2000	Valve	Valve	[windows, mac, linux]	[Multi-player, Online Multi-Player, Valve Anti...	[Action]	175	10	7500000	3.99	0.801278	0.096	29925000.0	1	1	1	1	0	0	0	1	1	0	0	0	0	0	1
6	70	Half-Life	1998	Valve	Valve	[windows, mac, linux]	[Single-player, Multi-player, Online Multi-Pla...	[Action]	1300	83	7500000	7.19	0.961878	0.096	53925000.0	1	1	1	1	0	0	0	1	1	0	1	0	1	0	1
7	80	Counter-Strike: Condition Zero	2004	Valve	Valve	[windows, mac, linux]	[Single-player, Multi-player, Valve Anti-Cheat...	[Action]	427	43	15000000	7.19	0.893871	0.096	107850000.0	1	1	1	1	0	0	0	1	0	0	1	0	0	0	1
8	130	Half-Life: Blue Shift	2001	Gearbox Software	Valve	[windows, mac, linux]	[Single-player]	[Action]	361	205	7500000	3.99	0.900990	0.096	29925000.0	1	1	1	1	0	0	0	0	0	0	1	0	0	0	0
9	220	Half-Life 2	2004	Valve	Valve	[windows, mac, linux]	[Single-player, Steam Achievements, Steam Trad...	[Action]	691	402	15000000	7.19	0.965601	0.512	107850000.0	1	1	1	1	1	1	0	0	0	1	1	1	1	1	0

Predicting The Success of Video Games¶

Kevin Rathbun¶

1. Introduction¶

The Dataset:

Important Notes:

2. Gathering and Transforming the Data¶

3. Exploring the Data: Visualization and Analysis¶

3.1 Game Genre and Success¶

Revenue¶

Playtime¶

Ratings¶

3.2 Game Categories and Success¶

Revenue¶

Playtime¶

Ratings¶

3.3 Game Price and Success¶

How we will evaluate our graphs:¶

Revenue¶

Playtime¶

Ratings¶

3.4 Game Hardware Requirements and Success¶

How we will evaluate the graphs:¶

Revenue¶

Playtime¶

Ratings¶

4. Conclusion¶

Predicting The Success of Video Games¶

Kevin Rathbun¶

1. Introduction¶

The Dataset:

Important Notes:

2. Gathering and Transforming the Data¶

3. Exploring the Data: Visualization and Analysis¶

3.1 Game Genre and Success¶

Revenue¶

Playtime¶

Ratings¶

3.2 Game Categories and Success¶

Revenue¶

Playtime¶

Ratings¶

3.3 Game Price and Success¶

How we will evaluate our graphs:¶

Revenue¶

Playtime¶

Ratings¶

3.4 Game Hardware Requirements and Success¶

How we will evaluate the graphs:¶

Revenue¶

Playtime¶

Ratings¶

4. Conclusion¶

Predicting The Success of Video Games¶

Kevin Rathbun¶