dedpo dedpo - 7 months ago 6
Python Question

writing to CSV in python

My csv writer currently does not produced row by row it just jumbles it up. Any help would be great, basically i need csv with the 4 lines in yields sections below in one colulmn.

tweets_df=tweets_df.dropna()

for i in tweets_df.ix[:,0]:
if regex_getter(i) != None:
print(regex_getter(i))


yields

Burlington, VT
Minneapolis, MN
Bloomington, IN
Irvine, CA


with open('Bernie.csv', 'w') as mycsvfile:
for i in tweets_df.ix[:,0]:
if regex_getter(i) != None:
row = regex_getter(i)
writer.writerow([i])




def regex_getter(entry):

txt = entry

re1='((?:[a-z][a-z]+))' # Word 1
re2='(,)' # Any Single Character 1
re3='(\\s+)' # White Space 1
re4='((?:(?:AL)|(?:AK)|(?:AS)|(?:AZ)|(?:AR)|(?:CA)|(?:CO)|(?:CT)|(?:DE)|(?:DC)|(?:FM)|(?:FL)|(?:GA)|(?:GU)|(?:HI)|(?:ID)|(?:IL)|(?:IN)|(?:IA)|(?:KS)|(?:KY)|(?:LA)|(?:ME)|(?:MH)|(?:MD)|(?:MA)|(?:MI)|(?:MN)|(?:MS)|(?:MO)|(?:MT)|(?:NE)|(?:NV)|(?:NH)|(?:NJ)|(?:NM)|(?:NY)|(?:NC)|(?:ND)|(?:MP)|(?:OH)|(?:OK)|(?:OR)|(?:PW)|(?:PA)|(?:PR)|(?:RI)|(?:SC)|(?:SD)|(?:TN)|(?:TX)|(?:UT)|(?:VT)|(?:VI)|(?:VA)|(?:WA)|(?:WV)|(?:WI)|(?:WY)))(?![a-z])' # US State 1

rg = re.compile(re1+re2+re3+re4,re.IGNORECASE|re.DOTALL)
m = rg.search(txt)
if m:
word1=m.group(1)
c1=m.group(2)
ws1=m.group(3)
usstate1=m.group(4)
return str((word1 + c1 +ws1 + usstate1))


What my data looks without the regex method, it basically takes out all data that is not in City, State format. It excluded everything not like Raleigh, NC for example.

for i in tweets_df.ix[:,0]:
print(i)


Indiana, USA
Burlington, VT
United States
Saint Paul - Minneapolis, MN
Inland Valley, The Pass, S. CA
In the Dreamatorium
Nova Scotia;Canada
North Carolina, USA
INTP. West Michigan
Los Angeles, California
Waterbury Connecticut
Right side of the tracks

Answer

I would do it this way:

states =  {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

# sample DF
data = """\
location
Indiana, USA
Burlington, VT
United States
Saint Paul - Minneapolis, MN
Inland Valley, The Pass, S. CA
In the Dreamatorium
Nova Scotia;Canada
North Carolina, USA
INTP. West Michigan
Los Angeles, California
Waterbury Connecticut
Right side of the tracks
"""
df = pd.read_csv(io.StringIO(data), sep=r'\|')

re_states = r'.*,\s*(?:' + '|'.join(states.keys()) + ')'

df.ix[df.location.str.contains(re_states), 'location'].to_csv('filtered.csv', index=False)

Explanation:

In [3]: df
Out[3]:
                          location
0                     Indiana, USA
1                   Burlington, VT
2                    United States
3     Saint Paul - Minneapolis, MN
4   Inland Valley, The Pass, S. CA
5              In the Dreamatorium
6               Nova Scotia;Canada
7              North Carolina, USA
8              INTP. West Michigan
9          Los Angeles, California
10           Waterbury Connecticut
11        Right side of the tracks

generated RegEx:

In [9]: re_states
Out[9]: '.*,\\s*(?:VA|AK|ND|CA|CO|AR|MD|DC|KY|LA|OR|VT|IL|CT|OH|GA|WA|AS|NC|MN|NH|ID|HI|NA|MA|MS|WV|VI|FL|MO|MI|AL|ME|GU|NM|SD|WY|AZ|MP|DE|RI|PA|
NJ|WI|OK|TN|TX|KS|IN|NV|NY|NE|PR|UT|IA|MT|SC)'

Search mask:

In [10]: df.location.str.contains(re_states)
Out[10]:
0     False
1      True
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: location, dtype: bool

Filtered DF:

In [11]: df.ix[df.location.str.contains(re_states)]
Out[11]:
                       location
1                Burlington, VT
3  Saint Paul - Minneapolis, MN

Now just spool it to CSV:

df.ix[df.location.str.contains(re_states), 'location'].to_csv('d:/temp/filtered.csv', index=False)

filtered.csv:

"Burlington, VT"
"Saint Paul - Minneapolis, MN"
Comments