Bedouin Bedouin - 17 days ago 5
Python Question

How to use map in Python for reading files?

I have the following Python code, which parses URLs from each file in a directory. I am trying to use the `map` function to implement multiprocessing:

import glob, os
import xmltodict
import mysql.connector
from multiprocessing import Pool


def get_xml_paths(folder):
    """Return a generator of paths for the entries of *folder* named like XML.

    An entry is selected when its name contains the substring ``'xml'``
    anywhere; each selected name is joined onto *folder* with
    ``os.path.join``.

    Args:
        folder: Directory to list.

    Returns:
        A generator of path strings, in ``os.listdir`` order.
    """
    return (os.path.join(folder, f)
            for f in os.listdir(folder)
            if 'xml' in f)

# NOTE(review): this snippet lost its indentation when it was pasted; the
# nesting has to be restored before it can run.
# Purpose: parse one XML document shaped like urlset -> url -> loc and insert
# every URL containing "books" into the `apps` table.
# `to`, `c` and `conn` are not defined in the visible snippet -- presumably an
# upper index bound, a DB cursor and a DB connection; confirm.
def openXML(file):

global i
# `file` comes from pool.map(openXML, files), whose items are path strings
# built with os.path.join, so .read() fails here -- this is the line the
# traceback quoted further down points at.
doc = xmltodict.parse(file.read())
for i in range(0, len(doc['urlset']['url'])):

# Stop once the index passes the `to` limit.
if i > to:
break

## Validation: only URLs containing "books" are stored
url = doc['urlset']['url'][i]['loc'];
if "books" in url:
c.execute("INSERT INTO apps (url) VALUES (%s)", [url])
conn.commit()

# NOTE(review): range() already supplies the next i, so this increment looks
# redundant -- confirm whether it was meant to sit inside the loop.
i = i + 1

if __name__ == '__main__':

    # Paths of the candidate XML files inside the "unzip/" directory.
    files = get_xml_paths("unzip/")

    # pool.map hands each element of `files` (a path string) to openXML in a
    # worker process and blocks until all of them have been processed.
    pool = Pool()
    pool.map(openXML, files)
    pool.close()
    pool.join()
    # `c` is not defined in the visible snippet -- presumably the DB cursor
    # that openXML writes through; confirm.
    c.close()


When I run this script, I get the following error:

multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\O\AppData\Local\Programs\Python\Python35-32\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "C:\Users\O\AppData\Local\Programs\Python\Python35-32\lib\multiprocessing\pool.py", line 44, in mapstar
return list(map(*args))
File "C:\Users\O\PycharmProjects\Grabber\grabber.py", line 28, in openXML
doc = xmltodict.parse(file.read())
AttributeError: 'str' object has no attribute 'read'


How can I fix this? I don't see an obvious cause.

Answer

`file` in `openXML` is a string (the file's path), not a file object, and strings have no `read` method. You have to open the file first:

import glob, os
import xmltodict
import mysql.connector
from multiprocessing import Pool

def open_xml(file):
    """Insert every "books" URL found in one XML file into the ``apps`` table.

    The document is read as ``urlset -> url -> loc``; each ``loc`` value that
    contains the substring ``"books"`` is inserted, and the inserts for the
    file are committed together.

    Args:
        file: Path of the XML file to read (a string, as passed by
            ``pool.map``).

    Raises:
        KeyError: If the document has no ``urlset`` root or an entry has no
            ``loc`` key.

    NOTE(review): ``conn`` is not defined in the visible snippet. This
    function runs inside pool workers, so presumably every worker process
    needs its own connection -- confirm how ``conn`` is created.
    """
    with open(file) as xml:
        doc = xmltodict.parse(xml.read())

    # An <urlset> without any children parses to None rather than a dict.
    urlset = doc['urlset'] or {}
    entries = urlset.get('url') or []
    # xmltodict returns a single dict instead of a list when the document
    # contains exactly one <url> element; iterating that dict would yield
    # its keys, not URL entries.
    if isinstance(entries, dict):
        entries = [entries]

    urls = [entry['loc'] for entry in entries if "books" in entry['loc']]
    if not urls:
        return

    cursor = conn.cursor()
    try:
        for url in urls:
            cursor.execute("INSERT INTO apps (url) VALUES (%s)", [url])
        # One commit per file instead of one per inserted row.
        conn.commit()
    finally:
        # Release the cursor even when an insert fails.
        cursor.close()

if __name__ == '__main__':
    files = glob.glob("unzip/*.xml")
    # The context manager shuts the worker processes down on exit; map()
    # blocks until every file has been processed, so no work is cut short.
    with Pool() as pool:
        pool.map(open_xml, files)