python_scrape_team_salary
In [1]:
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
In [2]:
# Empty frame that the per-team payroll totals are concatenated onto later on.
salary_df = pd.DataFrame(
    data=[],
    columns=['year', 'team', 'salary'],
)
In [3]:
# Season and franchise to scrape; both values are substituted into the
# team-page URL below.
year = 2016
team = 'GSW'
url = 'http://www.basketball-reference.com/teams/{}/{}.html'.format(team, year)

# Download the raw page. The handle is closed explicitly so the underlying
# connection is released even when read() raises part-way through.
response = urllib2.urlopen(url)
try:
    html = response.read()
finally:
    response.close()
In [4]:
# Display the accumulator (no rows yet) to confirm its column layout.
salary_df
Out[4]:
year team salary
In [6]:
# Parse the downloaded page with the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')
In [7]:
# Locate the <table> element whose id attribute is "salaries".
team_stats = soup.find("table", attrs={"id": "salaries"})
In [8]:
# Show the raw markup of the salaries table to inspect its row/cell structure.
team_stats
Out[8]:
<table class=" stats_table" id="salaries">
<colgroup><col><col><col></col></col></col></colgroup>
<thead>
<tr class="">
<th align="right" class="ranker sort_default_asc show_partial_when_sorting" data-stat="ranker" tip="Rank">Rk</th>
<th align="left" class=" sort_default_asc" data-stat="player">Player</th>
<th align="right" class="" data-stat="salary">Salary</th>
</tr>
</thead>
<tbody>
<tr class="">
<td align="right" csk="1">1</td>
<td align="left"><a href="/players/t/thompkl01.html">Klay Thompson</a></td>
<td align="right" csk="15500000">$15,500,000</td>
</tr>
<tr class="">
<td align="right" csk="2">2</td>
<td align="left"><a href="/players/g/greendr01.html">Draymond Green</a></td>
<td align="right" csk="14300000">$14,300,000</td>
</tr>
<tr class="">
<td align="right" csk="3">3</td>
<td align="left"><a href="/players/b/bogutan01.html">Andrew Bogut</a></td>
<td align="right" csk="12000000">$12,000,000</td>
</tr>
<tr class="">
<td align="right" csk="4">4</td>
<td align="left"><a href="/players/i/iguodan01.html">Andre Iguodala</a></td>
<td align="right" csk="11710456">$11,710,456</td>
</tr>
<tr class="">
<td align="right" csk="5">5</td>
<td align="left"><a href="/players/c/curryst01.html">Stephen Curry</a></td>
<td align="right" csk="11370786">$11,370,786</td>
</tr>
<tr class="">
<td align="right" csk="6">6</td>
<td align="left"><a href="/players/t/thompja02.html">Jason Thompson</a></td>
<td align="right" csk="6431250">$6,431,250</td>
</tr>
<tr class="">
<td align="right" csk="7">7</td>
<td align="left"><a href="/players/l/livinsh01.html">Shaun Livingston</a></td>
<td align="right" csk="5543725">$5,543,725</td>
</tr>
<tr class="">
<td align="right" csk="8">8</td>
<td align="left"><a href="/players/b/barneha02.html">Harrison Barnes</a></td>
<td align="right" csk="3873398">$3,873,398</td>
</tr>
<tr class="">
<td align="right" csk="9">9</td>
<td align="left"><a href="/players/s/speigma01.html">Marreese Speights</a></td>
<td align="right" csk="3815000">$3,815,000</td>
</tr>
<tr class="">
<td align="right" csk="10">10</td>
<td align="left"><a href="/players/b/barbole01.html">Leandro Barbosa</a></td>
<td align="right" csk="2500000">$2,500,000</td>
</tr>
<tr class="">
<td align="right" csk="11">11</td>
<td align="left"><a href="/players/e/ezelife01.html">Festus Ezeli</a></td>
<td align="right" csk="2008748">$2,008,748</td>
</tr>
<tr class="">
<td align="right" csk="12">12</td>
<td align="left"><a href="/players/r/rushbr01.html">Brandon Rush</a></td>
<td align="right" csk="1270964">$1,270,964</td>
</tr>
<tr class="">
<td align="right" csk="13">13</td>
<td align="left"><a href="/players/l/looneke01.html">Kevon Looney</a></td>
<td align="right" csk="1131960">$1,131,960</td>
</tr>
<tr class="">
<td align="right" csk="14">14</td>
<td align="left"><a href="/players/c/clarkia01.html">Ian Clark</a></td>
<td align="right" csk="947276">$947,276</td>
</tr>
<tr class="">
<td align="right" csk="15">15</td>
<td align="left"><a href="/players/m/mcadoja01.html">James Michael McAdoo</a></td>
<td align="right" csk="845059">$845,059</td>
</tr>
</tbody>
</table>
In [9]:
# Every <tr> of the salaries table, header row included.
data = team_stats.find_all("tr")

# Reduce each row to its text content; the cell values come out separated
# (and surrounded) by newlines.
new_data = [row.text for row in data]
new_data
Out[9]:
[u'\nRk\nPlayer\nSalary\n',
 u'\n1\nKlay Thompson\n$15,500,000\n',
 u'\n2\nDraymond Green\n$14,300,000\n',
 u'\n3\nAndrew Bogut\n$12,000,000\n',
 u'\n4\nAndre Iguodala\n$11,710,456\n',
 u'\n5\nStephen Curry\n$11,370,786\n',
 u'\n6\nJason Thompson\n$6,431,250\n',
 u'\n7\nShaun Livingston\n$5,543,725\n',
 u'\n8\nHarrison Barnes\n$3,873,398\n',
 u'\n9\nMarreese Speights\n$3,815,000\n',
 u'\n10\nLeandro Barbosa\n$2,500,000\n',
 u'\n11\nFestus Ezeli\n$2,008,748\n',
 u'\n12\nBrandon Rush\n$1,270,964\n',
 u'\n13\nKevon Looney\n$1,131,960\n',
 u'\n14\nIan Clark\n$947,276\n',
 u'\n15\nJames Michael McAdoo\n$845,059\n']
In [10]:
# Turn the newline separators into commas and split at most three times, so
# the thousands separators inside the salary stay together in the last field.
new_data2 = [row_text.replace('\n', ',').split(',', 3) for row_text in new_data]

# Each row's text starts with a newline, which makes the first field an
# empty string; drop it and keep rank, player and salary.
new_data3 = [fields[1:] for fields in new_data2]
In [11]:
# Inspect the split rows; the last field still carries "$" and commas here.
new_data3
Out[11]:
[[u'Rk', u'Player', u'Salary,'],
 [u'1', u'Klay Thompson', u'$15,500,000,'],
 [u'2', u'Draymond Green', u'$14,300,000,'],
 [u'3', u'Andrew Bogut', u'$12,000,000,'],
 [u'4', u'Andre Iguodala', u'$11,710,456,'],
 [u'5', u'Stephen Curry', u'$11,370,786,'],
 [u'6', u'Jason Thompson', u'$6,431,250,'],
 [u'7', u'Shaun Livingston', u'$5,543,725,'],
 [u'8', u'Harrison Barnes', u'$3,873,398,'],
 [u'9', u'Marreese Speights', u'$3,815,000,'],
 [u'10', u'Leandro Barbosa', u'$2,500,000,'],
 [u'11', u'Festus Ezeli', u'$2,008,748,'],
 [u'12', u'Brandon Rush', u'$1,270,964,'],
 [u'13', u'Kevon Looney', u'$1,131,960,'],
 [u'14', u'Ian Clark', u'$947,276,'],
 [u'15', u'James Michael McAdoo', u'$845,059,']]
In [12]:
# Remove the "$" sign and every comma from the third field of each row;
# the first two fields (rank and player name) pass through unchanged.
new_data4 = [
    [row[0], row[1], row[2].replace(',', '').replace('$', '')]
    for row in new_data3
]
new_data4
Out[12]:
[[u'Rk', u'Player', u'Salary'],
 [u'1', u'Klay Thompson', u'15500000'],
 [u'2', u'Draymond Green', u'14300000'],
 [u'3', u'Andrew Bogut', u'12000000'],
 [u'4', u'Andre Iguodala', u'11710456'],
 [u'5', u'Stephen Curry', u'11370786'],
 [u'6', u'Jason Thompson', u'6431250'],
 [u'7', u'Shaun Livingston', u'5543725'],
 [u'8', u'Harrison Barnes', u'3873398'],
 [u'9', u'Marreese Speights', u'3815000'],
 [u'10', u'Leandro Barbosa', u'2500000'],
 [u'11', u'Festus Ezeli', u'2008748'],
 [u'12', u'Brandon Rush', u'1270964'],
 [u'13', u'Kevon Looney', u'1131960'],
 [u'14', u'Ian Clark', u'947276'],
 [u'15', u'James Michael McAdoo', u'845059']]
In [13]:
# The first parsed row holds the column titles: use it as the header, then
# discard that row (index label 0) and the rank column.
df = (
    pd.DataFrame(new_data4, columns=new_data4[0])
    .drop(0, axis=0)
    .drop('Rk', axis=1)
)
df
Out[13]:
Player Salary
1 Klay Thompson 15500000
2 Draymond Green 14300000
3 Andrew Bogut 12000000
4 Andre Iguodala 11710456
5 Stephen Curry 11370786
6 Jason Thompson 6431250
7 Shaun Livingston 5543725
8 Harrison Barnes 3873398
9 Marreese Speights 3815000
10 Leandro Barbosa 2500000
11 Festus Ezeli 2008748
12 Brandon Rush 1270964
13 Kevon Looney 1131960
14 Ian Clark 947276
15 James Michael McAdoo 845059
In [14]:
# Total payroll for the team. The digit strings are converted to float64
# rather than float32: float32 represents integers exactly only up to 2**24
# (16,777,216), which is below the team total and below large individual
# salaries, so dollar amounts could be silently rounded.
tot_salary = df['Salary'].astype('float64').sum()

# One-row frame with the columns 'year', 'team', 'salary' holding this
# team's season total.
current_entry = pd.DataFrame(
    [[year, team, tot_salary]],
    columns=['year', 'team', 'salary'],
)
current_entry
Out[14]:
year team salary
0 2016 GSW 93248622
In [15]:
# Append this team's total to the running table. ignore_index=True renumbers
# the rows 0..n-1; without it every appended one-row entry keeps its own
# label 0 and the accumulated frame ends up with duplicate index values.
salary_df = pd.concat([salary_df, current_entry], ignore_index=True)
salary_df
Out[15]:
year team salary
0 2016 GSW 93248622
In [ ]:
# Write the accumulated totals to "<team>_salary.csv" (relative path).
csv_name = '{}_salary.csv'.format(team)
salary_df.to_csv(csv_name)