## Testing existence of files

`exists` — true if at least one argument exists

```perl
#!/usr/bin/perl
# Exit 0 as soon as one argument names an existing path, else exit 1.
for(@ARGV) {
    -e && exit 0;
}
exit 1;
```

`allexist` — false if at least one argument does not exist

```perl
#!/usr/bin/perl
# Exit 1 as soon as one argument does not exist, else exit 0.
for(@ARGV) {
    ! -e && exit 1;
}
exit 0;
```

`filterstat` — `filter(stat,argv[1:])`

```perl
#!/usr/bin/perl
# Print, one per line, the arguments that name existing paths.
for(@ARGV) {
    -e && print "$_\n";
}
```

## Power rename aka file search and replace

`filesr` — string replace in filenames

```python
#!/usr/bin/env python3
"""filesr — plain-string search and replace in filenames.

Usage: filesr searchpat replpat files...

Environment variables:
  OVERWRITE=y|Y   replace an existing target instead of skipping it
  DRYRUN or D     any value other than n/N: print the plan, rename nothing
"""
import os
import sys
import uuid


def env_enabled(name):
    """Return True if env var `name` is set to anything other than n/N."""
    value = os.getenv(name)
    return value is not None and value not in ("n", "N")


args = sys.argv[1:]
overwrite = os.getenv("OVERWRITE") in ("y", "Y")
forreal = not (env_enabled("DRYRUN") or env_enabled("D"))

if len(args) < 3:
    print("filesr searchpat replpat files...")
    sys.exit(1)
s, r = args[:2]
files = args[2:]

print(f"""filesr search: {s} replace with: {r}""")


def temp_name_for(path):
    """Return an unused name in the same directory as `path`.

    Staying in that directory keeps the two-step rename on one filesystem,
    so it cannot fail with a cross-device error.
    """
    parent = os.path.dirname(path)
    while True:
        t = os.path.join(parent, str(uuid.uuid4()))
        if not os.path.exists(t):
            return t


def rn(x, y):
    """Rename x to y; do nothing on a dry run.

    An existing y is only replaced when OVERWRITE is set.
    """
    if not forreal:
        return
    if not os.path.exists(y):
        os.rename(x, y)
        return
    print(f"{y} exists")
    if overwrite:
        try:
            os.rename(x, y)
        except IsADirectoryError:
            print(f"{y} is a directory")
        else:
            # Report success only once the rename has actually happened.
            print(f"Overwritten {y}")


def rn_via_temp(x, y):
    """Rename x to y through a temporary name (for case-only renames)."""
    t = temp_name_for(x)
    print(f"Using temp name {t}: {x} --> {y}")
    if not forreal:
        return
    os.rename(x, t)
    if os.path.exists(y) and not overwrite:
        # y is a separate directory entry (e.g. a hard link), not x under
        # another spelling: put x back rather than strand it as t.
        os.rename(t, x)
        print(f"{y} already exists")
        return
    rn(t, y)


for x in files:
    y = x.replace(s, r)
    if x == y:
        continue
    target_exists = os.path.exists(y)
    case_only = x.lower() == y.lower()
    # On a case-insensitive filesystem y "exists" because it is x itself, so
    # this test has to come before the plain "already exists" check.
    if case_only and (
        not target_exists or (os.path.exists(x) and os.path.samefile(x, y))
    ):
        print("Capitalisation issue")
        rn_via_temp(x, y)
    elif target_exists and not overwrite:
        print(f"{y} already exists")
    else:
        print(f"Rename {x} --> {y}")
        rn(x, y)
```

`filesrx` — regex replace in filenames

```python
#!/usr/bin/env python3
"""filesrx — regex search and replace in filenames.

Usage: filesrx searchpat replpat files...

searchpat is a Python regular expression; replpat may use group references.

Environment variables:
  OVERWRITE=y|Y   replace an existing target instead of skipping it
  VERBOSE=y|Y     also report files that do not match the regex
  DRYRUN or D     any value other than n/N: print the plan, rename nothing
"""
import os
import re
import sys
import uuid


def env_enabled(name):
    """Return True if env var `name` is set to anything other than n/N."""
    value = os.getenv(name)
    return value is not None and value not in ("n", "N")


args = sys.argv[1:]
overwrite = os.getenv("OVERWRITE") in ("y", "Y")
verbose = os.getenv("VERBOSE") in ("y", "Y")
forreal = not (env_enabled("DRYRUN") or env_enabled("D"))

if len(args) < 3:
    print("filesrx searchpat replpat files...")
    print(" searchpat is regex")
    if len(args) == 2:
        print("(did you forget to specify files)")
    sys.exit(1)
s, r = args[:2]
files = args[2:]

print(f"""filesrx regex: {s} replace with: {r}""")

try:
    sr = re.compile(s)
except re.error as e:
    # A bad pattern is a usage error, not a reason for a traceback.
    print(f"Invalid regex {s}: {e}")
    sys.exit(1)


def temp_name_for(path):
    """Return an unused name in the same directory as `path`.

    Staying in that directory keeps the two-step rename on one filesystem,
    so it cannot fail with a cross-device error.
    """
    parent = os.path.dirname(path)
    while True:
        t = os.path.join(parent, str(uuid.uuid4()))
        if not os.path.exists(t):
            return t


def rn(x, y):
    """Rename x to y; do nothing on a dry run.

    An existing y is only replaced when OVERWRITE is set.
    """
    if x == y:
        print(f"No change in {x}")
        return
    if not forreal:
        return
    if not os.path.exists(y):
        os.rename(x, y)
        return
    print(f"{y} exists")
    if overwrite:
        try:
            os.rename(x, y)
        except IsADirectoryError:
            print(f"{y} is a directory")
        else:
            # Report success only once the rename has actually happened.
            print(f"Overwritten {y}")


def rn_via_temp(x, y):
    """Rename x to y through a temporary name (for case-only renames)."""
    t = temp_name_for(x)
    print(f"Using temp name {t}: {x} --> {y}")
    if not forreal:
        return
    os.rename(x, t)
    if os.path.exists(y) and not overwrite:
        # y is a separate directory entry (e.g. a hard link), not x under
        # another spelling: put x back rather than strand it as t.
        os.rename(t, x)
        print(f"{y} already exists")
        return
    rn(t, y)


for x in files:
    try:
        if verbose and sr.search(x) is None:
            print(f"File {x} does not match {s}")
        y = sr.sub(r, x)
    except (re.error, IndexError) as e:
        # Typically a bad group reference in the replacement template.
        print(f"Regex {s} failed for file {x}")
        print(e)
        continue
    if x == y:
        continue
    target_exists = os.path.exists(y)
    case_only = x.lower() == y.lower()
    # On a case-insensitive filesystem y "exists" because it is x itself, so
    # this test has to come before the plain "already exists" check.
    if case_only and (
        not target_exists or (os.path.exists(x) and os.path.samefile(x, y))
    ):
        print("Capitalisation issue")
        rn_via_temp(x, y)
    elif target_exists and not overwrite:
        print(f"{y} already exists")
    else:
        print(f"Rename {x} --> {y}")
        rn(x, y)
```

## Finding stuff

`fdup` — find duplicate files, first finding candidates by size, then by hashing the first 64k, then by the first 1M, then all the file (if desired – usually hashing the first 64k is enough to whittle things down to a small enough set to do by hand)

```python
#!/usr/bin/env python3
"""fdup — find duplicate files under the given roots.

Usage: fdup [-q] root...

Candidates are narrowed by size, then by SHA-256 of the first 64k, then of
the first 1M.  With -q the survivors of the 1M pass are reported as
duplicates; otherwise each survivor is hashed in full first.
"""
import hashlib
import os
import sys
from collections import defaultdict

quick = False


def doprint(*xs, **kw):
    """print(), flushed immediately so progress output shows up live."""
    kw.setdefault("flush", True)
    print(*xs, **kw)


def hash_first(filename, chunk_size=64*1024):
    """Return the SHA-256 hex digest of the first chunk_size bytes."""
    doprint(f"Hashing first {chunk_size} of {filename}")
    with open(filename, "rb") as f:
        return hashlib.sha256(f.read(chunk_size)).hexdigest()


def hash_all(filename, chunk_size=1024*1024):
    """Return the SHA-256 hex digest of the whole file, printing progress."""
    doprint(f"Hashing all of {filename}")
    sz = os.path.getsize(filename)
    done = 0
    chunks = 0
    sha = hashlib.sha256()
    with open(filename, "rb") as f:
        while byte_block := f.read(chunk_size):
            doprint(".", end="")
            sha.update(byte_block)
            done += len(byte_block)
            chunks += 1
            if chunks >= 30:
                # sz may be stale (even 0) if the file is being written to.
                pc = 100 * (done / sz) if sz else 100.0
                doprint(f" {pc:0.3f}%")
                chunks = 0
    doprint()
    return sha.hexdigest()


def regroup(groups, hasher):
    """Split every multi-file group further by hasher(path).

    Keys grow by one element per pass, so a file is only ever compared with
    files that matched it on every earlier criterion (size included).
    Files that cannot be read are reported and dropped.
    """
    refined = defaultdict(list)
    for key, paths in groups.items():
        if len(paths) < 2:
            continue
        for p in paths:
            try:
                refined[key + (hasher(p),)].append(p)
            except OSError as e:
                doprint(f"Skipping {p}: {e}", file=sys.stderr)
    return refined


def count_candidates(groups):
    """Return how many files sit in groups of more than one."""
    return sum(len(ps) for ps in groups.values() if len(ps) > 1)


def report(groups):
    """Print each group of more than one file under its most recent hash."""
    dups = False
    for key, fs in groups.items():
        if len(fs) > 1:
            if not dups:
                dups = True
                doprint("Dups:\n=====\n\n")
            doprint(f"hash {key[-1]}:")
            for f in fs:
                doprint(f" {f}")


roots = []
for x in sys.argv[1:]:
    if x == "-q":
        quick = True
    else:
        roots.append(x)


class T:
    """Debugging aid: calling the instance t times exits the program."""

    def __init__(self, t=10):
        self.t = t

    def __call__(self):
        self.t -= 1
        if self.t <= 0:
            doprint("Exiting")
            sys.exit(0)

# t = T(1000)

# pass 1: compile by_size
doprint("Finding files")
by_size = defaultdict(list)
for root in roots:
    for rt, dirs, files in os.walk(root):
        for f in files:
            doprint(".", end="")
            path = os.path.join(rt, f)
            try:
                by_size[(os.path.getsize(path),)].append(path)
            except OSError as e:
                # e.g. a dangling symlink or a file deleted mid-scan
                doprint(f"\nSkipping {path}: {e}", file=sys.stderr)
doprint("Done finding files")

candidates = count_candidates(by_size)
doprint(f"{candidates} candidates by size")
if candidates == 0:
    sys.exit(0)

# pass 2: compile by_hash64k
by_hash64k = regroup(by_size, lambda p: hash_first(p, 64*1024))
candidates = count_candidates(by_hash64k)
doprint(f"{candidates} candidates by hash 64k")
if candidates == 0:
    sys.exit(0)

# pass 3: compile by_hash1m
by_hash1m = regroup(by_hash64k, lambda p: hash_first(p, 1024*1024))
candidates = count_candidates(by_hash1m)
doprint(f"{candidates} candidates by hash 1M")
if candidates == 0:
    sys.exit(0)

if quick:
    report(by_hash1m)
else:
    # pass 4: compile by hashall
    report(regroup(by_hash1m, hash_all))
```

`find_empty_files`

```bash
#!/bin/bash
# Usage: find_empty_files [dir [extra find(1) arguments...]]
# Lists zero-length regular files under dir (default: current directory).
A="${1:-.}"
[ "$#" -gt 0 ] && shift
# -type f: sockets, FIFOs and (on some filesystems) directories also have
# size 0 but are not empty files.
find "$A" -type f -size 0 "$@"
```

## FS related

`fsof` — find the filesystem containing a file (there is probably a far more elegant way to do this, especially if you know Linux's C APIs better than I do)

```bash
#!/bin/bash
# Usage: fsof <file>
# Prints the type of the filesystem that contains <file>.
A="$1"
if [ -z "$A" ]; then
    echo "fsof <file>" >&2
    exit 1
elif [ -e "$A" ]; then
    # findmnt resolves the mount point containing the path itself, so this
    # also works for filesystems that are not backed by a /dev node.
    findmnt -n -o FSTYPE -T "$A"
else
    echo "File or directory '$A' does not exist" >&2
    exit 1
fi
```

`guessbd` — map `a2` to `/dev/sda2`, `sda2` to `/dev/sda2`, and a file in general to the block device containing its filesystem.

```bash
#!/bin/bash
# Usage: guessbd <name-or-path>

# Print "$1" and exit successfully if it names a block device.
f() {
    if [ -b "$1" ]; then
        echo "$1"
        exit 0
    fi
}

f "$1"
f "/dev/$1"
f "/dev/sd$1"
if [ -e "$1" ]; then
    # -P keeps each filesystem on a single line even for long device names.
    A="$(df -P "$1" | tail -n1 | cut -f1 -d' ')"
    f "$A"
fi
echo "Failed to guess block device for '$1'" >&2
exit 1
```