user3812837 user3812837 - 3 months ago 7
Python Question

Python why time increases 13 times when using list append instead of string concatenation?

I have a script. Please find complete code at code review here

I have removed string concatenation & replaced it with list append as below

sample old code

def comment_line(self, line):
self.line = line
line_to_write = ""
line_to_write += "//"+self.line+"\n" #This is the line that I have changed
self.outputfile.write(line_to_write)


sample new code

def comment_line(self, line):
self.line = line
self.output_list.append("//"+self.line+"\n") #This is the line that I have changed


complete code is at code review here

Then i ran code using below script to find time of execution

import time
start = time.time()

import os
t =os.system('python_file.py input_file.asn')
print('It took', time.time()-start, 'seconds.')


Time taken by old & new script is as below

Old script took 0.019999980926513672 seconds.
>>> ================================ RESTART ================================
>>>
New script took 0.2999999523162842 seconds.


Complete new code

import re
from collections import deque
import sys
import inflection


class Convert(object):
'''To do: add data'''
def __init__(self):
'''To do: add data'''
self.plist = []
self.slist = []
self.tlist = []
self.llist = []
self.lines = []
self.line = None
self.open_braces = []
self.close_braces = []
self.outputfile = None
self.i = None
self.open_brace_list = []
self.close_brace_list = []
self.file_name = None
self.split_character = None
self.length = None
self.enumvariable_flag = None
self.inner_variable_prefix=""
self.output_list=[]

def start_tag(self, split_character, line):
'''To do: add data'''
self.split_character = split_character
self.line = line
self.output_list.append("enum E")
self.inner_variable_prefix = inflection.camelize(inflection.underscore((self.line.split(self.split_character)[0]).replace('-', '_')).lower()).strip()
self.output_list.append(self.inner_variable_prefix)
self.output_list.append("//"+self.line)
self.output_list.append("\n")
self.output_list.append("{\n")
self.enumvariable_flag = True

def end_tag(self,line):
self.line=line
self.output_list.append("};\n")
self.enumvariable_flag = False

def comment_line(self, line):
self.line = line
self.output_list.append("//"+self.line+"\n")

def handle_comment(self, line):
'''To do: add data'''
self.line = line
if (line.strip()).startswith("--")or(re.search(r'(.*)\{(.*)\}(.*)', line)):
self.output_list.append(" ")
self.output_list.append("//"+self.line+"\n")

def handle_inner_element(self, line, index):
'''To do: add data'''
self.line = line
self.index = index
if self.output_list[-1] != " ":
self.output_list.append(" ")
try:
try:
value = (re.findall(r'\d+', self.line.strip().split(' ')[1])[0])
self.output_list.append("e")
self.output_list.append(self.inner_variable_prefix)
self.output_list.append(inflection.camelize((self.line.strip().split(' ')[0]).replace('-', '_')))
self.output_list.append(" = ")
self.output_list.append(value)
if self.index not in self.llist:
self.output_list.append(",")
self.output_list.append("\n")
except Exception as e:
if (self.line.strip().split(' ')[0]).lower() == \
self.line.strip().split(' ')[1].split('-')[0].lower():
self.output_list.append("e")
self.output_list.append(self.inner_variable_prefix)
self.output_list.append(inflection.camelize((
self.line.strip().split(' ')[0].replace('-', '_')).lower()))
if self.index not in self.llist:
self.output_list.append(",")
else:
self.output_list.append("//")
self.output_list.append(self.line)
self.output_list.append("\n")
except Exception as exception:
print(exception)

def generate_lists(self, length, lines):
'''To do: add data'''
self.length = length
self.lines = lines
flag_llist=False
lastl=None
reg1 = r'::=(.*)\n\{'
reg2 = r'{'
reg3 = r'\}'
reg4 = r'(.*)\{(.*)\}(.*)'
for index, line in enumerate(self.lines):

if index < (self.length-1):
val = str(line) + "\n" + str(self.lines[index+1])
else:
val = str(line)
if re.search(reg1, val)and(not re.search(reg4, val)):
self.plist.append(index)
flag_llist=True
else:
val = str(line)
if re.search(reg2, val)and(not re.search(reg4, val)):
if index in self.plist:
pass
else:
self.slist.append(index)
flag_llist=True
if re.search(reg3, val)and(not re.search(reg4, val)):
self.tlist.append(index)
self.llist.append(lastl)
flag_llist=False
elif flag_llist:
try:
value = (re.findall(r'\d+', line.strip().split(' ')[1])[0])
lastl=index
except Exception as e:
pass
try:
if (line.strip().split(' ')[0]).lower() == \
line.strip().split(' ')[1].split('-')[0].lower():
lastl=index
except Exception as e:
pass

return self.plist, self.slist, self.tlist

def add_sub_element(self, open_brace_list, close_brace_list):
'''To do: add data'''
self.open_brace_list = open_brace_list
self.close_brace_list = close_brace_list
self.enumvariable_flag = False
for i in range(1, len(self.open_brace_list)):
for index, line in enumerate(self.lines):
if index == self.open_brace_list[i]-1:
self.start_tag(' ', line)
if (index <= self.close_brace_list[i-1])and\
(index > self.open_brace_list[i])and self.enumvariable_flag:
self.handle_comment(line)
if (self.line.strip()).startswith("}"):
self.end_tag(line)
if self.enumvariable_flag and(not (self.line.strip()).startswith("--"))and\
(not (self.line.strip()).startswith("{")and\
(index <= self.close_brace_list[i-1])and(index > open_brace_list[i])):
self.handle_inner_element(line, index)

def braces_line_no(self, i):
'''To do: add data'''
self.i = i
remaining_slist = [a for a in self.slist if a > self.plist[self.i]]
remaining_tlist = [a for a in self.tlist if a > self.plist[self.i]]
try:
self.open_braces = [b for b in remaining_slist if b < self.plist[self.i+1]]
except Exception as e:
self.open_braces = remaining_slist
try:
self.close_braces = [b for b in remaining_tlist if b < self.plist[self.i+1]]
except Exception as e:
self.close_braces = remaining_tlist
return self.open_braces, self.close_braces

def generate_output(self, file_name):
'''To do: add data'''
self.file_name = file_name
output_file_name = self.file_name.split('.')[0]+".hpp"
self.outputfile = open(output_file_name, 'w')
with open(self.file_name) as f_in:
self.lines = (line.strip() for line in f_in)
self.lines = list(line for line in self.lines if line)
length = len(self.lines)
self.plist, self.slist, self.tlist = self.generate_lists(length, self.lines)
for i in range(len(self.plist)):
self.open_braces, self.close_braces = self.braces_line_no(i)
open_braces_qu = deque(self.open_braces)
for index, line in enumerate(self.lines):
if (not self.enumvariable_flag)and(self.tlist[-1] != self.close_braces[-1]):
if(index > self.close_braces[-1]) and (index < self.slist[self.slist.index(self.open_braces[-1])+1]-1):
self.comment_line(line)
elif self.enumvariable_flag==None and (index < self.plist[0]):
self.comment_line(line)
elif self.close_braces[-1] == self.tlist[-1] and index > self.tlist[-1]:
self.comment_line(line)
if index == self.plist[i]:
self.start_tag('::=', line)
elif len(self.open_braces) == 1 and len(self.close_braces) == 1 and\
self.enumvariable_flag:
self.handle_comment(line)
if (self.line.strip()).startswith("}"):
self.end_tag(line)
if self.enumvariable_flag and(not (line.strip()).startswith("--"))and\
(not (line.strip()).startswith("{")):
self.handle_inner_element(line, index)

elif self.enumvariable_flag and(len(self.open_braces) > 1)and(len(open_braces_qu) > 1):
if self.output_list[-1] != " ":
self.output_list.append(" ")
try:
if index == open_braces_qu[1]-1:
try:
value = (re.findall(r'\d+', line.strip().split(' ')[1])[0])
self.output_list.append("e")
self.output_list.append(self.inner_variable_prefix)
self.output_list.append(inflection.camelize(inflection.underscore(line.strip().split(' ')[0]\
.replace('-', '_')).lower()))
self.output_list.append(" = ")
self.output_list.append(value)
if len(open_braces_qu) > 2:
self.output_list.append(", ")
self.output_list.append("\n")
except Exception as e:
if (line.strip().split(' ')[0]).lower() == line.strip()\
.split(' ')[1].split('-')[0].lower():
self.output_list.append("e")
self.output_list.append(self.inner_variable_prefix)
self.output_list.append(inflection.camelize(inflection.underscore(line.strip().split(' ')[0].replace('-', '_')).lower()))
if len(open_braces_qu) > 2:
self.output_list.append(", ")
else:
self.output_list.append("//")
self.output_list.append(line)
self.output_list.append("\n")
open_braces_qu.popleft()
if len(open_braces_qu) == 1:
self.end_tag(line)
open_braces_qu.popleft()
self.add_sub_element(self.open_braces, self.close_braces)

except Exception as exception:
print(exception)


for data in self.output_list:
self.outputfile.write(data)
self.outputfile.close()

if __name__ == '__main__':
INPUT_FILE_NAME = sys.argv[1]
CON_OBJ = Convert()
CON_OBJ.generate_output(INPUT_FILE_NAME)

Answer

The second example contains a string concatenation ("//"+self.line+"\n") and appending to a list.

That doesn't explain why it's suddenly so much slower; my guess is that the list contains many elements. Appending to long lists can be expensive since Python has to copy the list eventually.

In the original code, you just created a short string and flushed that to a buffer (and eventually to the file system). This operation can be relatively cheap. If you append to a list with millions of elements, Python will eventually run out of space in the underlying data structure and have to copy it into a bigger one. And after adding N more elements, it has to do the same.

In addition to that, your code to measure the timing is not reliable. Background jobs can have a huge influence when you do it this way. Use the timeit module as suggested by cdrake or maybe the shell command time (timeit will be more accurate).

[EDIT] There are three strategies: String concatenation (SC), list append (LA) and streaming to a file (STF).

SC is efficient when you concatenate short strings and don't keep them around for long. SC becomes ever more inefficient as the string becomes longer because for every append, Python has to copy the whole string.

LA is efficient when you need to keep the data around. Lists allocate N slots. As long as you don't need them all, adding to the list is cheap: You just use one of the free slots. Lists become expensive when you run out of slots because then Python has to copy the list. So they are a bit more efficient than SC but eventually, they suffer from the same underlying problem: Append too much and the copy times will kill you.

STF means you open a file and write the data into the file as you produce it. You only keep small amounts of the data in memory. This is efficient for large amounts of output because you avoid the copying of existing data. The drawback is that this is not efficient for small amounts of data because of the overhead.

Conclusion: Know your data structures. There is no structure which works in every case. All of them have advantages and disadvantages.

Comments