I am familiar with scraping data in this format.
{"data":[{"assists":0,"assistsPerGame":0.0000,"evAssists":0,"evPoints":0,"gamesPlayed":1,"goals":0,"penaltyMinutes":0,"playerBirthCity":"Windsor","playerBirthCountry":"CAN","playerBirthDate":"1996-02-07",
import csv
import requests
outfile = open("NHL_Recent.csv","a",newline='')
writer = csv.writer(outfile)
writer.writerow(["Player","Pos","GP","G","A","P","+/-","PIM","PPG","PPP","SHG","SHP","GWG","OTG","S","S%","TOI","Shifts/PG","FOW%"])
req = requests.get('http://www.nhl.com/stats/rest/skaters?isAggregate=true&reportType=basic&isGame=true&reportName=skatersummary&sort=[{%22property%22:%22shots%22,%22direction%22:%22DESC%22}]&cayenneExp=gameDate%3E=%222017-11-4%22%20and%20gameDate%3C=%222017-11-10%22%20and%20gameTypeId=2')
data = req.json()['data']
for item in data:
    Player = item['playerName']
    Pos = item['playerPositionCode']
    GP = item['gamesPlayed']
    # ... the remaining columns are pulled the same way, then written with writer.writerow and the file is closed

The schedule data I'm trying to scrape now looks like this instead:
"totalItems" : 600,
"totalEvents" : 0,
"totalGames" : 600,
"totalMatches" : 0,
"wait" : 10,
"dates" : [ {
"date" : "2017-10-04",
"totalItems" : 4,
"totalEvents" : 0,
"totalGames" : 4,
"totalMatches" : 0,
"games" : [ {
"gamePk" : 2017020001,
"link" : "/api/v1/game/2017020001/feed/live",
"gameType" : "R",
"season" : "20172018",
"gameDate" : "2017-10-04T23:00:00Z",
"status" : {
"abstractGameState" : "Final",
"codedGameState" : "7",
"detailedState" : "Final",
"statusCode" : "7",
"startTimeTBD" : false
},
"teams" : {
"away" : {
"leagueRecord" : {
"wins" : 1,
"losses" : 0,
"ot" : 0,
"type" : "league"
},
"score" : 7,
"team" : {
"id" : 10,
"name" : "Toronto Maple Leafs",
"link" : "/api/v1/teams/10",
"venue" : {
"name" : "Air Canada Centre",
"link" : "/api/v1/venues/null",
"city" : "Toronto",
"timeZone" : {
"id" : "America/Toronto",
"offset" : -5,
"tz" : "EST"
}
},
"abbreviation" : "TOR",
"teamName" : "Maple Leafs",
"locationName" : "Toronto",
"firstYearOfPlay" : "1926",
"division" : {
"id" : 17,
"name" : "Atlantic",
"link" : "/api/v1/divisions/17"
},
"conference" : {
"id" : 6,
"name" : "Eastern",
"link" : "/api/v1/conferences/6"
},
"franchise" : {
"franchiseId" : 5,
"teamName" : "Maple Leafs",
"link" : "/api/v1/franchises/5
Here is my attempt:

import csv
import requests
import os
outfile = open("NHL DIF JSON.csv","a",newline='')
writer = csv.writer(outfile)
writer.writerow(["Date","Game","gamep"])
req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()['dates']
for item in data:
    Date = item['date']
    ##for item in games:
    Game = item['0']
    gamep = item['gamePk']
    print(Date,Game)
    writer.writerow([Date,Game,gamep])
outfile.close()
os.system("taskkill /f /im pythonw.exe")
It's normal JSON data, just a bit complicated. You can get the date from data['dates'][i]['date']. For the teams, scores and so on, you have to iterate over data['dates'][i]['games'].
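If it helps to see the nesting before writing any extraction code, you can pretty-print a single date entry first. This is only an inspection aid, and the shortened URL below simply drops the optional expand parameters from your query:

import json
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30')
data = req.json()

# Print the first entry of 'dates' with indentation to see how 'games', 'teams', etc. nest
print(json.dumps(data['dates'][0], indent=2))

With that structure in mind, the extraction can look like this: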
import csv
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=2017-10-04&endDate=2018-04-30&expand=schedule.teams,schedule.linescore,schedule.broadcasts.all,schedule.ticket,schedule.game.content.media.epg,schedule.radioBroadcasts,schedule.metadata,schedule.game.seriesSummary,seriesSummary.series&leaderCategories=&leaderGameTypes=R&site=en_nhl&teamId=&gameType=&timecode=')
data = req.json()

my_data = []
for item in data['dates']:
    date = item['date']
    games = item['games']
    for game in games:
        gamePk = game['gamePk']
        gameDate = game['gameDate']
        team_away, team_home = game['teams']['away'], game['teams']['home']
        team_away_score = team_away['score']
        team_home_score = team_home['score']
        team_away_name = team_away['team']['name']
        team_home_name = team_home['team']['name']
        my_data.append([date, gamePk, gameDate, team_away_name, team_home_name, team_away_score, team_home_score])

headers = ["Date","gamePk","gameDate","team_away_name","team_home_name","team_away_score","team_home_score"]
with open("my_file.csv", "a", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)
As for your last question, you can get the 'pk' from data['gameData']['game']['pk']. The player, event, triCode and coordinates values are a little harder to get because some items don't have 'players' and 'team' keys, or the 'coordinates' dict is empty. In this case the dict.get method can be helpful, because it returns None (or a default value you choose) instead of raising an error when the key doesn't exist.
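For instance, using an empty dict to stand in for a play whose 'coordinates' entry is empty:

coordinates = {}                   # stands in for a play with an empty 'coordinates' dict

# coordinates['x'] would raise a KeyError here
print(coordinates.get('x'))        # prints None
print(coordinates.get('x', 0))     # prints 0, because a default value was supplied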
Still, you have to design your code around the structure of the JSON data. For example:
import requests

req = requests.get('https://statsapi.web.nhl.com/api/v1/game/2017020001/feed/live?site=en_nhl')
data = req.json()

my_data = []
pk = data['gameData']['game']['pk']
for item in data['liveData']['plays']['allPlays']:
    players = item.get('players')
    if players:
        # Not every play involves two players, so guard the second lookup
        player_a = players[0]['player']['fullName'] if len(players) > 0 else None
        player_b = players[1]['player']['fullName'] if len(players) > 1 else None
    else:
        player_a, player_b = None, None
    event = item['result']['event']
    triCode = item.get('team', {}).get('triCode')
    # 'coordinates' is always present but sometimes empty, so .get() avoids KeyErrors
    coordinates_x, coordinates_y = item['coordinates'].get('x'), item['coordinates'].get('y')
    my_data.append([pk, player_a, player_b, event, triCode, coordinates_x, coordinates_y])

for row in my_data:
    print(row)
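If you want to write these rows to a CSV as well, the same pattern as the schedule snippet works; the file name below is just an example:

import csv

# my_data comes from the loop above
headers = ["pk", "player_a", "player_b", "event", "triCode", "coordinates_x", "coordinates_y"]
with open("plays.csv", "w", newline='') as f:   # use "a" instead of "w" if you want to append
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(my_data)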