# Importing packages
from io import StringIO

import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Fetch the gender listening-patterns table, requested sorted by poprank.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as data.
gender_patterns = req.get(
    'https://everynoise.com/tview.cgi?source=gender_listening_patterns&sort=poprank&colorthis=true',
    timeout=30,
)
gender_patterns.raise_for_status()


# Narrow the document down to its first <table> element.
soup = BeautifulSoup(gender_patterns.content, 'html.parser')
soup = soup.find("table")
if soup is None:
    raise ValueError('No <table> element found in the downloaded page')


# read_html wants a file-like object; passing literal HTML text is deprecated
# since pandas 2.1, so the markup is wrapped in StringIO.
df = pd.read_html(StringIO(str(soup)))[0]
df.head()


# The parsed table holds its column names in the first data row; promote that
# row to the header and keep the remaining rows as data.
df = pd.DataFrame(df.values[1:], columns=df.iloc[0])
df.head()


# Drop the '#' column.
df = df.drop(['#'], axis=1)


# Drop the last row of the table.
# NOTE(review): presumably a footer/summary row rather than a genre -- confirm.
df = df[:-1]
df.head()


# Keep the first 300 rows (the page was requested with sort=poprank).
df = df[:300]
df.head()


def _to_numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged if any value cannot be parsed."""
    # Explicit replacement for to_numeric(errors='ignore'), which is deprecated
    # since pandas 2.2: a column that fails to convert is left exactly as it was.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df = df.apply(_to_numeric_if_possible)


# Show any rows that contain at least one missing value.
df[df.isnull().values.any(axis=1)]


# To prevent unnecessary rewriting when we add our regression later
def listeners_to_streams():
    """Open a new figure and scatter 'female listeners' against 'female streams'.

    Reads the module-level ``df``. The x axis carries the 'female streams'
    column and the y axis the 'female listeners' column; both axes get ticks
    every 0.1 from 0.0 to 1.0 and the y axis is clamped to [0, 1]. Nothing is
    returned: later pyplot calls draw onto the figure created here.
    """
    _, axes = plt.subplots()
    plt.scatter(df[['female streams']], df[['female listeners']])

    axes.set_title('Proportions of female streams compared to female listeners')
    axes.set_xlabel('Female streams')
    axes.set_ylabel('Female listeners')

    tenths = np.linspace(0.0, 1.0, num=11)
    axes.set_xticks(tenths)
    axes.set_yticks(tenths)
    axes.set_ylim(0, 1)

listeners_to_streams()


# Feature and target as (n, 1) arrays, one row per genre.
x_train = df[['female streams']].values
y_train = df[['female listeners']].values

# Sort by x so the fitted curve can be drawn left to right. Both arrays must
# be reordered with the SAME permutation: sorting each by its own argsort
# would pair the k-th smallest x with the k-th smallest y, destroying the
# per-genre (x, y) pairs and inflating the fit's R^2.
sort_order = x_train[:, 0].argsort()
x_train = x_train[sort_order]
y_train = y_train[sort_order]

# Expand x into polynomial features up to degree 3 (bias, x, x^2, x^3).
poly = PolynomialFeatures(3)
x_poly = poly.fit_transform(x_train)

poly_reg = linear_model.LinearRegression()

poly_reg.fit(x_poly, y_train)

# Redraw the scatter plot and overlay the fitted cubic in red.
listeners_to_streams()
plt.plot(x_train, poly_reg.predict(x_poly), color='red')

[<matplotlib.lines.Line2D at 0x7f9b20a17610>]


# R^2 of the degree-3 fit, scored on the same data it was fitted on
# (there is no held-out set, so this is a training score).
poly_reg.score(x_poly, y_train)

0.9799605869317595


# Genre names from the first five rows of the table.
most_popular = df['genre'].iloc[:5]
most_popular

0              pop
1        dance pop
2    post-teen pop
3              rap
4             rock
Name: genre, dtype: object


# Empty axes (ticks every 0.1 on both axes) onto which only the five genres
# in most_popular are written as text labels.
fig, ax = plt.subplots()
ax.set_xlabel('Female listeners')
ax.set_ylabel('Female streams')
ax.set_title('Proportions of female artists compared to female listeners')
ax.set_xticks(np.linspace(0.0, 1.0, num=11))
ax.set_yticks(np.linspace(0.0, 1.0, num=11))

# This annotates the 5 most popular genres
# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female listeners'],
                      top_rows['female streams']):
    plt.annotate(name, (x, y))

plt.show()


# Plot of female streams vs. ffshare
def streams_ffshare_setup():
    """Open a new figure decorated for the 'female streams' vs. 'ffshare' plots.

    Only axis labels, ticks (every 0.1 from 0.0 to 1.0 on both axes) and the
    title are set; nothing is plotted and nothing is returned, so callers draw
    onto the new figure through pyplot.
    """
    _, axes = plt.subplots()
    tenths = np.linspace(0.0, 1.0, num=11)

    axes.set_title('Share of female streams vs share of female streams from women')
    axes.set_xlabel('Share of streams of women within genre')
    axes.set_ylabel('Share of women listening to women within genre')
    axes.set_xticks(tenths)
    axes.set_yticks(tenths)

# Scatter of 'ffshare' against 'female streams' for every row of df.
streams_ffshare_setup()
plt.scatter(df[['female streams']], df[['ffshare']])

# NOTE(review): lreg is never fitted or used; the line is fitted by np.polyfit.
lreg = linear_model.LinearRegression()
# Degree-1 least-squares fit: m is the slope, b the intercept.
m, b = np.polyfit(df['female streams'], df['ffshare'], 1)
plt.plot(df[['female streams']], m * df[['female streams']] + b, color='red')

plt.show()

# Same plot as before, but only the top 5 genres
streams_ffshare_setup()

# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female streams'],
                      top_rows['ffshare']):
    plt.annotate(name, (x, y))

plt.show()

# Plot of female streams vs. mfshare
def streams_mfshare_setup():
    """Open a new figure decorated for the 'female streams' vs. 'mfshare' plots.

    Only axis labels, ticks (every 0.1 from 0.0 to 1.0 on both axes) and the
    title are set; nothing is plotted and nothing is returned, so callers draw
    onto the new figure through pyplot.
    """
    _, axes = plt.subplots()
    tenths = np.linspace(0.0, 1.0, num=11)

    axes.set_title('Share of female streams vs share of female streams from men')
    axes.set_xlabel('Share of streams of women within genre')
    axes.set_ylabel('Share of men listening to women within genre')
    axes.set_xticks(tenths)
    axes.set_yticks(tenths)

# Scatter of 'mfshare' against 'female streams' for every row of df.
streams_mfshare_setup()
plt.scatter(df[['female streams']], df[['mfshare']])

# NOTE(review): lreg is never fitted or used; the line is fitted by np.polyfit.
lreg = linear_model.LinearRegression()
# Degree-1 least-squares fit: m is the slope, b the intercept.
m, b = np.polyfit(df['female streams'], df['mfshare'], 1)
plt.plot(df[['female streams']], m * df[['female streams']] + b, color='red')

plt.show()

# Same plot as before, but only the top 5 genres
streams_mfshare_setup()

# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female streams'],
                      top_rows['mfshare']):
    plt.annotate(name, (x, y))

plt.show()


# Add two derived columns: each share column minus the 'female streams' column
# of the same row.
for share_col, diff_col in (('mfshare', 'mf_overall_diff'),
                            ('ffshare', 'ff_overall_diff')):
    df[diff_col] = df[share_col] - df['female streams']
df.head()


def femstream_mf_overall_setup():
    """Open a new figure decorated for the 'mf_overall_diff' plots.

    Sets the axis labels, x ticks every 0.1 from 0.0 to 1.0, a fixed y range
    of [-0.2, 0.2] and the title. Nothing is plotted and nothing is returned,
    so callers draw onto the new figure through pyplot.
    """
    _, axes = plt.subplots()

    axes.set_title('Differences in listening patterns of men from the average listener within a genre')
    axes.set_xlabel('Share of streams of women within genre')
    axes.set_ylabel('Share of men listening to women/nb artists more')

    tenths = np.linspace(0.0, 1.0, num=11)
    axes.set_xticks(tenths)
    axes.set_ylim(-0.2, 0.2)

# Scatter of 'mf_overall_diff' against 'female streams' for every row of df.
femstream_mf_overall_setup()
plt.scatter(df[['female streams']], df[['mf_overall_diff']])

# Degree-1 least-squares fit: m is the slope, b the intercept.
m, b = np.polyfit(df['female streams'], df['mf_overall_diff'], 1)
plt.plot(df[['female streams']], m * df[['female streams']] + b, color='red')
plt.show()

# Same plot as before, but only the top 5 genres
femstream_mf_overall_setup()

# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female streams'],
                      top_rows['mf_overall_diff']):
    plt.annotate(name, (x, y))

plt.show()

def femstream_ff_overall_setup():
    """Open a new figure decorated for the 'ff_overall_diff' plots.

    Sets the axis labels, x ticks every 0.1 from 0.0 to 1.0, a fixed y range
    of [-0.2, 0.2] and the title. Nothing is plotted and nothing is returned,
    so callers draw onto the new figure through pyplot.
    """
    _, axes = plt.subplots()

    axes.set_title('Differences in listening patterns of women from the average listener within a genre')
    axes.set_xlabel('Share of streams of women within genre')
    axes.set_ylabel('Share of women listening to women/nb artists more')

    tenths = np.linspace(0.0, 1.0, num=11)
    axes.set_xticks(tenths)
    axes.set_ylim(-0.2, 0.2)

# Scatter of 'ff_overall_diff' against 'female streams' for every row of df.
femstream_ff_overall_setup()
plt.scatter(df[['female streams']], df[['ff_overall_diff']])

# Degree-1 least-squares fit: m is the slope, b the intercept.
m, b = np.polyfit(df['female streams'], df['ff_overall_diff'], 1)
plt.plot(df[['female streams']], m * df[['female streams']] + b, color='red')

plt.show()

# Same plot as before, but only the top 5 genres
femstream_ff_overall_setup()

# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female streams'],
                      top_rows['ff_overall_diff']):
    plt.annotate(name, (x, y))

plt.show()

def mf_ff_overall_setup():
    """Open a new figure decorated for the 'shareskew' plots.

    Sets the axis labels, x ticks every 0.1 from 0.0 to 1.0, a fixed y range
    of [-0.3, 0.3] and the title. Nothing is plotted and nothing is returned,
    so callers draw onto the new figure through pyplot.
    """
    _, axes = plt.subplots()

    axes.set_title('Differences in listening patterns between men and women within a genre')
    axes.set_xlabel('Share of streams of women within genre')
    axes.set_ylabel('ffshare minus mfshare')

    tenths = np.linspace(0.0, 1.0, num=11)
    axes.set_xticks(tenths)
    axes.set_ylim(-0.3, 0.3)

# Scatter of 'shareskew' against 'female streams' for every row of df.
mf_ff_overall_setup()
plt.scatter(df[['female streams']], df[['shareskew']])

# Degree-1 least-squares fit: m is the slope, b the intercept.
m, b = np.polyfit(df['female streams'], df['shareskew'], 1)
plt.plot(df[['female streams']], m * df[['female streams']] + b, color='red')

plt.show()

# Same plot as before, but only the top 5 genres
mf_ff_overall_setup()

# Rows are looked up through most_popular's index and annotate receives plain
# scalars. Filtering df by genre name instead yields length-1 Series, which
# only work as coordinates through implicit float(Series) conversion
# (deprecated in pandas 2.1) and break if a genre name is duplicated.
top_rows = df.loc[most_popular.index]
for name, x, y in zip(top_rows['genre'],
                      top_rows['female streams'],
                      top_rows['shareskew']):
    plt.annotate(name, (x, y))

plt.show()

	0	1	2	3	4	5	6	7	8
0	#	genre	female listeners	female streams	gender tilt	ffshare	mfshare	shareskew	poprank
1	1	pop	0.617	0.514	0.103	0.548	0.443	0.105	1
2	2	dance pop	0.602	0.608	-0.006	0.646	0.524	0.122	2
3	3	post-teen pop	0.681	0.643	0.038	0.659	0.617	0.042	3
4	4	rap	0.276	0.030	0.246	0.062	0.025	0.037	4

	#	genre	female listeners	female streams	gender tilt	ffshare	mfshare	shareskew	poprank
0	1	pop	0.617	0.514	0.103	0.548	0.443	0.105	1
1	2	dance pop	0.602	0.608	-0.006	0.646	0.524	0.122	2
2	3	post-teen pop	0.681	0.643	0.038	0.659	0.617	0.042	3
3	4	rap	0.276	0.030	0.246	0.062	0.025	0.037	4
4	5	rock	0.316	0.077	0.239	0.105	0.068	0.037	5

	genre	female listeners	female streams	gender tilt	ffshare	mfshare	shareskew	poprank
0	pop	0.617	0.514	0.103	0.548	0.443	0.105	1
1	dance pop	0.602	0.608	-0.006	0.646	0.524	0.122	2
2	post-teen pop	0.681	0.643	0.038	0.659	0.617	0.042	3
3	rap	0.276	0.030	0.246	0.062	0.025	0.037	4
4	rock	0.316	0.077	0.239	0.105	0.068	0.037	5

	genre	female listeners	female streams	gender tilt	ffshare	mfshare	shareskew	poprank
0	pop	0.617	0.514	0.103	0.548	0.443	0.105	1
1	dance pop	0.602	0.608	-0.006	0.646	0.524	0.122	2
2	post-teen pop	0.681	0.643	0.038	0.659	0.617	0.042	3
3	rap	0.276	0.030	0.246	0.062	0.025	0.037	4
4	rock	0.316	0.077	0.239	0.105	0.068	0.037	5

	genre	female listeners	female streams	gender tilt	ffshare	mfshare	shareskew	poprank	mf_overall_diff	ff_overall_diff
0	pop	0.617	0.514	0.103	0.548	0.443	0.105	1	-0.071	0.034
1	dance pop	0.602	0.608	-0.006	0.646	0.524	0.122	2	-0.084	0.038
2	post-teen pop	0.681	0.643	0.038	0.659	0.617	0.042	3	-0.026	0.016
3	rap	0.276	0.030	0.246	0.062	0.025	0.037	4	-0.005	0.032
4	rock	0.316	0.077	0.239	0.105	0.068	0.037	5	-0.009	0.028

How gender might be impacting your music choices

by Joseph Davies

Introduction

Data Collection

Tidying up

What the columns mean

Exploratory data analysis and Machine Learning

First thoughts

Conclusion