Kyle Kramer Kyle Kramer - 2 years ago 211
Python Question

Ignoring inner header rows in pandas dataframe

I am scraping multiple tables from the web that are exactly like this one (the big batting gamelogs table) and I need the dataframe to ignore the inner header rows that start with the month of the season.

Here is my script so far:

from bs4 import BeautifulSoup
import pandas as pd
import csv
import urllib2

def stir_the_soup():
    """Scrape each player's game-log table and save it as a CSV file.

    Reads one URL per line from ``player_links.txt`` and one player ID per
    line from ``player_ID_nums.txt``; the N-th URL is paired with the N-th
    ID. For every URL the page is downloaded, the ``batting_gamelogs`` or
    ``pitching_gamelogs`` table is parsed into a DataFrame, and the result
    is written to ``<player ID>.csv`` in the output directory.

    Raises:
        IndexError: if there are more URLs than player IDs.
    """
    # Directory that receives one CSV file per player.
    file_path = '/Users/kramerbaseball/Desktop/MLB_Web_Scraping_Program/game_logs_non_concussed/'

    # The context manager closes both input files even if a download or
    # parse step fails part-way through the loop.
    with open('player_links.txt', 'r') as player_links, \
            open('player_ID_nums.txt', 'r') as player_ID_nums:
        id_nums = [x.rstrip('\n') for x in player_ID_nums]

        for idx, url in enumerate(player_links):
            # Open the url and create the bs object; release the HTTP
            # response as soon as the page has been parsed.
            player_link = urllib2.urlopen(url)
            try:
                bs = BeautifulSoup(player_link, 'html5lib')
            finally:
                player_link.close()

            # Identify which table is needed. Lines read from the file keep
            # their trailing newline, so the index below counts it too.
            table_id = ""
            if url[-12] == 'b':
                table_id = "batting"
            elif url[-12] == 'p':
                table_id = "pitching"

            # Find the table and create the dataframe; read_html returns a
            # list of DataFrames, of which the first one is used.
            table = str(bs.find('table', {'id': (table_id + '_gamelogs')}))
            df = pd.read_html(table, header=0)
            df2 = df[0]

            # Keep only the rows whose PA cell is not the column name itself.
            df2 = df2[df2.PA != 'PA']

            # The player ID on the same line number names the output file.
            name_of_file = str(id_nums[idx])

            df2.to_csv(path_or_buf=(file_path + name_of_file + '.csv'), sep=',', encoding='utf-8')

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    stir_the_soup()

I tried filtering the dataframe to exclude the rows where PA == 'PA' or HR == 'HR', but the rows are not removed. Any help is appreciated.

Answer Source

Notice that in the inner header rows, some column values are constant (for example, the 'Gtm' column always contains 'Date'). Filtering on such a column will drop the intermediate headers from your df:

# Rows whose 'Gtm' cell is the literal text 'Date' are treated as repeated
# header rows; build a boolean mask and keep every other row.
is_data_row = df2['Gtm'] != 'Date'
df3 = df2[is_data_row]
