André André - 5 months ago 17
Python Question

Python: How to use output of "listfiles" to delete/ move/ etc. files

Background:

My goal is to find duplicate files in two different folders (without subfolders). To do that, I use the following Python script:

### Check whether all archives are still present or whether data was deleted during the check
def listfiles(path):
    """Return the "_GIS.7z" archives found below *path*, relative to *path*.

    The tree is walked recursively with os.walk. Files directly inside
    *path* are returned as bare file names; files in subfolders are returned
    as "subfolder/name", so listings of two different roots can be compared
    entry by entry.
    """
    files = []
    for dirName, subdirList, fileList in os.walk(path):
        # relpath strips only the leading root. The previous str.replace()
        # removed every occurrence of the root text and left a leading
        # separator whenever the root was passed without a trailing slash.
        rel_dir = os.path.relpath(dirName, path)
        if rel_dir == os.curdir:
            # Top-level files keep their bare name (no "./" prefix).
            rel_dir = ''
        for fname in fileList:
            if fname.endswith("_GIS.7z"):
                files.append(os.path.join(rel_dir, fname))
    return files

x = listfiles(root)
y = listfiles(backupfolderGIS)

# Alternative without sets:
# q = [filename for filename in x if filename not in y]

# Convert each listing to a set once and derive every comparison from them.
in_x, in_y = set(x), set(y)
files_only_in_x = in_x - in_y
files_only_in_y = in_y - in_x
files_only_in_either = in_x.symmetric_difference(in_y)
files_in_both = in_x.intersection(in_y)
all_files = in_x.union(in_y)

# Print each group under its heading, followed by a spacer line.
# Single-argument print(...) produces the same output as the print statement.
for heading, group in (
    ("Alle Datein:", all_files),
    ("Nur im Zwischenspeicher:", files_only_in_x),
    ("Nur im Backupordner:", files_only_in_y),
    ("Nur einem von beiden Ordnern:", files_only_in_either),
    ("In beiden Ordnern:", files_in_both),
):
    print(heading)
    print(group)
    print(" ")


The relevant output variable/list is `files_in_both` (folders); it shows me the duplicates. If I `print` it, it looks like `set(['NameoftheProject_GIS.7z', 'NameofanotherProject_GIS.7z'])`.

Question:

How can I use this output/information (the duplicate files in the directories) to delete or move them? For example, the files `NameoftheProject_GIS.7z` and `NameofanotherProject_GIS.7z` in the folder `backupfolderGIS`, as listed in `files_in_both`.

Answer

os.walk recursively checks all folders and subfolders starting from the root dir you pass. Since you want to check two different folders (without subfolders), just search each folder with glob. If you want to move the files, you can use shutil.move:

from glob import iglob
from os import path
from shutil import move

# The two folders to compare (placeholders; replace with the real paths).
pt1, pt2 = "/path_1", "/path_2"
pattern = "*_GIS.7z"

# Build the glob patterns from pt1/pt2 so the search always looks in the same
# folders the files are later moved from. Only base names are compared.
names_1 = {path.basename(found) for found in iglob(path.join(pt1, pattern))}
names_2 = {path.basename(found) for found in iglob(path.join(pt2, pattern))}
dupe = names_1 & names_2

for fle in dupe:
    # move(src, dest)
    move(path.join(pt1, fle), "wherever")

Or to delete use os.remove:

import os  # needed for os.remove; only `path` was imported from os so far

for fle in dupe:
    # Delete the duplicate from the first folder (pt1).
    os.remove(path.join(pt1, fle))

If you want to move/delete the file from pt2 then pass that to path.join in place of pt1.

You could also use str.endswith with os.listdir:

# Names ending in "_GIS.7z" that appear in both pt1 and pt2.
dupe = {name for name in os.listdir(pt1) if name.endswith("_GIS.7z")} & {
    name for name in os.listdir(pt2) if name.endswith("_GIS.7z")
}

To avoid repeating you can put it in a function:

from shutil import move
from os import path, listdir
def listfiles(path, end):
    """Return the set of entry names in directory *path* that end with *end*.

    Only the directory itself is listed (no recursion into subfolders).
    """
    matching = set()
    for entry in listdir(path):
        if entry.endswith(end):
            matching.add(entry)
    return matching

# Move every archive whose name occurs in both folders out of pt1.
# The source folder is pt1 (the original referenced an undefined name `t1`).
for fle in listfiles(pt1, "_GIS.7z").intersection(listfiles(pt2, "_GIS.7z")):
    move(path.join(pt1, fle), "wherever")

Now, if you did want to check all folders for files with the same basename and do something for each duplicated name, you would need to keep a record of the full paths. You can group all common files by basename using a defaultdict:

from os import path, walk
from collections import defaultdict

def listfiles(pth, end):
    """Group the files below *pth* whose names end with *end* by file name.

    The tree is walked recursively. The result is a defaultdict(list) mapping
    each matching file name to the list of paths (directory joined with the
    name) at which it was found.
    """
    by_name = defaultdict(list)
    for folder, _subdirs, names in walk(pth):
        for name in names:
            if not name.endswith(end):
                continue
            by_name[name].append(path.join(folder, name))
    return by_name

You will get a dict where the keys are the basenames and the values are lists of the full paths of the files with that name. Any list with more than one value means you have at least two files with the same name, but remember that having the same basename does not mean the files are actually the same.

Comments