For this investigation, I chose to look into the TMDb movie data set. This data set contains movie information such as cast and genre, and also financial information regarding the movie's budget and revenue. It also includes adjusted budget and revenue values in terms of 2010 dollars, accounting for inflation over time. These columns are called 'budget_adj' and 'revenue_adj' respectively.
The questions that I plan to explore revolve around traits that relate to the revenue of the movies, particularly the adjusted revenue in terms of 2010 dollars, since movies across multiple release years will be explored.
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import FuncFormatter
% matplotlib inline
# Display floats with thousands separators and two decimals instead of
# scientific notation (the format pads each value to a width of 20).
pd.options.display.float_format = '{:20,.2f}'.format

# Load the raw data and take a first look at its rows, summary statistics
# and column dtypes / non-null counts.
df = pd.read_csv('tmdb-movies.csv')
df.head()
df.describe()
df.info()

# Drop fully duplicated rows; the shape is shown before and after so the
# effect of the drop can be checked.
df[df.duplicated()].shape
df = df.drop_duplicates()
df[df.duplicated()].shape

# Drop rows that have no value in the 'genres' column.
df[df['genres'].isnull()].shape
df = df.dropna(subset=['genres'])
df[df['genres'].isnull()].shape

# Keep only rows whose adjusted budget AND adjusted revenue are positive
# (presumably zeros stand for missing financial data -- verify against the
# data set documentation).
df.query("(budget_adj == 0) | (revenue_adj == 0)").shape
has_financials = (df['budget_adj'] > 0) & (df['revenue_adj'] > 0)
# .copy() makes the filtered frame independent of the original, so adding
# columns to df later does not raise SettingWithCopyWarning.
df = df[has_financials].copy()
df.query("(budget_adj == 0) | (revenue_adj == 0)").shape
# Scatter plots of adjusted revenue against adjusted budget: first for all
# rows of df, then separately for adjusted budgets up to and above
# 100,000,000 so each range can be inspected at its own scale.
scatter_specs = [
    (df,
     'Adjusted Budget and Adjusted Revenue'),
    (df[df['budget_adj'] <= 100000000],
     'Adjusted Budget and Adjusted Revenue (Adjusted Budget <= 100,000,000)'),
    (df[df['budget_adj'] > 100000000],
     'Adjusted Budget and Adjusted Revenue (Adjusted Budget > 100,000,000)'),
]

for subset, title in scatter_specs:
    ax = subset.plot(x='budget_adj', y='revenue_adj', kind='scatter',
                     title=title, figsize=(16, 8))
    # Format ticks as integers with thousands separators. A formatter is used
    # instead of set_xticklabels()/set_yticklabels() because those pin the
    # label text without pinning the tick positions, so labels can end up on
    # the wrong ticks when matplotlib relocates them.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(
            FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))
Specifically, are there certain genres that tend to generate more revenue than others?
# Each 'genres' value is a '|'-separated string of genre names; split it once
# and reuse the resulting lists below.
genre_lists = df['genres'].str.split('|')

# Collect every distinct genre name, sorted alphabetically. Iterating the
# Series directly replaces Series.iteritems(), which was removed in pandas 2.0.
genres_set = set()
for names in genre_lists:
    genres_set.update(names)
genres_set = sorted(genres_set)
genres_set

# Add one boolean column per genre marking the movies tagged with it.
# Membership is tested against the split list, so a genre only matches as a
# whole name (no substring or regular-expression matching).
for genre in genres_set:
    df[genre] = pd.Series(
        [genre in names for names in genre_lists],
        index=df.index,
        dtype=bool,
    )

# Adjust the column slice so we can see some of the added individual genre columns
df.iloc[:, 13:].head()
# Column-oriented dictionary used to build the summary data frame: the single
# 'mean_revenue_adj' column maps each genre to the mean adjusted revenue of
# the rows flagged True in that genre's boolean column.
genre_rev_adj_means_dict = {
    'mean_revenue_adj': {
        genre: df.loc[df[genre], 'revenue_adj'].mean()
        for genre in genres_set
    }
}

# Build the data frame (genres as the index) and order it by ascending mean.
df_mean_rev_adj_genre = pd.DataFrame.from_dict(genre_rev_adj_means_dict)
df_mean_rev_adj_genre = df_mean_rev_adj_genre.sort_values(by=['mean_revenue_adj'])
df_mean_rev_adj_genre
# Bar chart of the mean adjusted revenue per genre, one bar per row of
# df_mean_rev_adj_genre in its current row order.
ax = df_mean_rev_adj_genre.plot(title='Average Adjusted Revenue By Genre', kind='bar', figsize=(16, 8), legend=False)
# Format y ticks as integers with thousands separators. A formatter is used
# instead of set_yticklabels() because that pins the label text without
# pinning the tick positions, so labels can drift onto the wrong ticks.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))
Specifically, is there a trend of revenue increasing or decreasing over time?
# Distinct release years present in df, in ascending order.
release_year_set = sorted(set(df['release_year']))
release_year_set

# Column-oriented dictionary used to build the summary data frame: the single
# 'mean_revenue_adj' column maps each release year to the mean adjusted
# revenue of the rows released in that year.
release_year_rev_adj_means_dict = {
    'mean_revenue_adj': {
        year: df.loc[df['release_year'] == year, 'revenue_adj'].mean()
        for year in release_year_set
    }
}

# Build the data frame (release years as the index).
df_mean_rev_adj_release_year = pd.DataFrame.from_dict(release_year_rev_adj_means_dict)
df_mean_rev_adj_release_year
# Bar chart of the mean adjusted revenue per release year, one bar per row of
# df_mean_rev_adj_release_year.
ax = df_mean_rev_adj_release_year.plot(title='Average Adjusted Revenue By Release Year', kind='bar', figsize=(16, 8), legend=False)
# Format y ticks as integers with thousands separators. A formatter is used
# instead of set_yticklabels() because that pins the label text without
# pinning the tick positions, so labels can drift onto the wrong ticks.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))