citramaillo citramaillo - 7 months ago 6
SQL Question

Warning thrown while trying to import scraped values into MySQL table - using Python and BS4

I'm getting a truncated data warning when I try to import the assigned values into the designated MySQL table.

The values are stored in the SQL table, minus "U.S." and one of the dates that has a massive amount of " " after it.

What can I do to convert the values to make the truncated values more agreeable?

: Warning: Data truncated for column 'last_count' at row 1
cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\")',(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count))
{'count': u'440', 'area': u'U.S.', 'change_from_prior_count': u'-3', 'last_count': u'15 April\r\n 2016', 'change_from_last_year': u'-514', 'date_of_last_year_count': u'17 April 2015', 'date_of_prior_count': u'8 April 2016'}

import scraperwiki
import requests
from bs4 import BeautifulSoup
import csv
import MySQLdb

#mysql portion
mydb = MySQLdb.connect(host='localhost',
user= '********',
passwd='*******',
db='testdb')
cur = mydb.cursor()

def store (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count):
cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\",$
cur.connection.commit()


base_url = 'http://phx.corporate-ir.net/phoenix.zhtml?c=79687&p=irol-rigcountsoverview'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")

table = soup.findAll('table')
rows = table[1].findAll("tr")
if len(soup.findAll('tr')) > 0:
rows = rows[1:]
for row in rows:
cells = row.findAll('td')
area = cells[0].get_text()
last_count = cells[1].get_text()
count = cells[2].get_text()
change_from_prior_count = cells[3].get_text()
date_of_prior_count = cells[4].get_text()
change_from_last_year = cells[5].get_text()
date_of_last_year_count = cells[6].get_text()
store(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count)
data = {
'area': cells[0].get_text(),
'last_count': cells[1].get_text(),
'count': cells[2].get_text(),
'change_from_prior_count': cells[3].get_text(),
'date_of_prior_count': cells[4].get_text(),
'change_from_last_year': cells[5].get_text(),
'date_of_last_year_count': cells[6].get_text(),
}

print data
print '\n'
mydb.close()

Answer

... one of the dates that has a massive amount of " " after it.

You can replace multiple whispaces with just one with a regular expression.

import scraperwiki
import requests
from bs4 import BeautifulSoup
import csv
import MySQLdb
import re

#mysql portion
mydb = MySQLdb.connect(host='localhost',
       user= '******',
       passwd='******',
       db='testdb')
cur = mydb.cursor()


def store (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count):
    cur.execute('INSERT IGNORE INTO RIGCOUNT (area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count) VALUES (\"%s\",$
    cur.connection.commit()


base_url = 'http://phx.corporate-ir.net/phoenix.zhtml?c=79687&p=irol-rigcountsoverview'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")

table = soup.findAll('table')
rows = table[1].findAll("tr")
if len(soup.findAll('tr')) > 0:
    rows = rows[1:]

white_space_pattern = re.compile(r"\s+")

for row in rows:
    cells = row.findAll('td')
    cells = [white_space_pattern.sub(' ', cell.get_text()) for cell in cells]

    area = cells[0]
    last_count =  cells[1]
    count = cells[2]
    change_from_prior_count = cells[3]
    date_of_prior_count = cells[4]
    change_from_last_year = cells[5]
    date_of_last_year_count = cells[6]

    store(area, last_count, count, change_from_prior_count, date_of_prior_count, change_from_last_year, date_of_last_year_count)
    data = {
       'area': area,
       'last_count': last_count,
       'count': count,
       'change_from_prior_count': change_from_prior_count,
       'date_of_prior_count': date_of_prior_count,
       'change_from_last_year': change_from_last_year,
       'date_of_last_year_count': date_of_last_year_count,
    }

    print data
    print '\n'
mydb.close()
Comments