SXC88 SXC88 - 8 days ago 5
Python Question

python threading.timer set time limit when program runs out of time

I have some questions related to setting the maximum running time of a function in Python. In fact, I would like to use

pdfminer
to convert the
.pdf
files to
.txt
.

The problem is that, very often, some files cannot be decoded and take an extremely long time to process. So I want to use
threading.Timer()
to limit the conversion time for each file to 5 seconds. In addition, I am running on Windows, so I cannot use the
signal
module for this.

I succeeded in running the conversion code with
pdfminer.convert_pdf_to_txt()
(in my code it is "
c
"), but I am not sure that, in the following code,
threading.Timer()
works. (I don't think it properly constrains the time for each processing)

In summary, I want to:


  1. Convert the pdf to txt

  2. Time limit for each conversion is 5 sec, if it runs out of time, throw an exception and save an empty file

  3. Save all the txt files under the same folder

  4. If there are any exceptions/errors, still save the file but with empty content.



Here is the current code:

import converter as c
import os
import timeit
import time
import threading
import thread

# Root directory that the os.walk() loop further down traverses.
yourpath = 'D:/hh/'

def iftimesout():
    """Timer callback: report the timeout and write an empty output file.

    Reads the module-level names ``g``, ``t`` and ``name`` that the main
    loop assigns for the file currently being converted, so it must only
    fire while that file is still the current one.
    """
    print("no")

    # Opening in "w" mode truncates the file, leaving it empty.
    with open("D:/f/"+g+"&"+t+"&"+name+".txt", mode="w") as newfile:
        newfile.write("")


# Walk the input tree and convert every file to text, one output per input.
for root, dirs, files in os.walk(yourpath, topdown=False):
    for name in files:
        source = os.path.join(root, name)

        # g, t and name stay module-level on purpose: iftimesout() reads them
        # to build the same output path when the timer fires.
        t = os.path.split(os.path.dirname(source))[1]
        a = str(os.path.split(os.path.dirname(source))[0])
        g = str(a.split("\\")[1])
        target = "D:/f/"+g+"&"+t+"&"+name+".txt"

        # NOTE: threading.Timer only runs iftimesout() on another thread after
        # 5 seconds; it cannot interrupt the conversion running here. A hard
        # time limit needs the work to run in a separate process that can be
        # terminated.
        timer = threading.Timer(5.0, iftimesout)
        timer.start()
        try:
            with open(target, mode="w") as newfile:
                newfile.write(c.convert_pdf_to_txt(source))
            print("yes")
        except Exception:
            # Only the file that failed is emptied; outputs already written
            # for other files in this directory are left untouched.
            # KeyboardInterrupt is not an Exception subclass, so it still
            # propagates.
            with open(target, mode="w") as newfile:
                newfile.write("")
        finally:
            # Always disarm the timer so it cannot fire while a later file
            # is being processed.
            timer.cancel()

Answer

Check the following code and let me know if you run into any issues. Also let me know whether you still want the forced-termination feature (KeyboardInterrupt).

# Time limits, both expressed in seconds.
TIMEOUT = 5  # upper bound on how long one conversion may run
TIME_TO_CHECK = 1  # step used when waiting for a conversion to finish

# Folder locations. The text folder path must end with "\\" because file
# names are appended to it directly; the PDF folder path must not.
path_to_pdf = "C:\\Path\\To\\Main\\PDFs"
path_to_text = "C:\\Path\\To\\Save\\Text\\"


# Save PDF content into a text file, or leave an empty file if conversion fails
def convert(path_to, my_pdf):
    """Convert one PDF to text, leaving an empty text file on failure.

    The output file is opened before the conversion starts, so it exists
    (and is empty) even if the conversion raises or the process running
    this function is killed part-way through.

    Args:
        path_to: Directory that contains the PDF.
        my_pdf: File name of the PDF inside ``path_to``.
    """
    my_txt = text_file_name(my_pdf)
    with open(my_txt, "w") as my_text_file:
        try:
            my_text_file.write(convert_pdf_to_txt(os.path.join(path_to, my_pdf)))
        except Exception:
            # Best effort: report the failure and keep the empty output file.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            print("Error. %s file wasn't converted" % my_pdf)

# Convert file_name.pdf from PDF folder to file_name.txt in Text folder
def text_file_name(pdf_file, text_dir=None):
    """Return the output text-file path for a PDF file name.

    Only the final extension is replaced, so ``my.report.pdf`` maps to
    ``my.report.txt`` rather than being cut at the first dot.

    Args:
        pdf_file: File name of the PDF (no directory part).
        text_dir: Output folder prefix, including its trailing separator.
            Defaults to the module-level ``path_to_text``.

    Returns:
        ``text_dir`` followed by the file name with a ``.txt`` extension.
    """
    if text_dir is None:
        text_dir = path_to_text
    return text_dir + os.path.splitext(pdf_file)[0] + ".txt"


if __name__ == "__main__":
    from multiprocessing import Process

    # Convert every file found under the PDF folder, one worker at a time.
    for root, dirs, files in os.walk(path_to_pdf, topdown=False):
        for my_file in files:
            p = Process(target=convert, args=(root, my_file))
            p.start()

            # Wait on the worker itself rather than on the output file's size:
            # a stale output from an earlier run, or a conversion that fails
            # or yields empty text, would otherwise be misread. join() returns
            # as soon as the worker exits, so fast conversions are not delayed.
            waited = 0
            while waited < TIMEOUT and p.is_alive():
                p.join(TIME_TO_CHECK)
                waited += TIME_TO_CHECK

            if p.is_alive():
                # Out of time: kill the worker, reap it, and make sure the
                # output is an empty file.
                p.terminate()
                p.join()
                with open(text_file_name(my_file), "w"):
                    pass
Comments