"""
Quick and dirty Python script to find references to completed qms in the
cave description pages. Run this to find which bits of description need
updating.

The list of qms is read from the qm.csv file and any with an entry in the
"Completion description" column (column 7) are searched for in all the html
files.

The script prints a list of the completed qms that it found references to
and in which file.

Nial Peters - 2011
"""

import csv
import re
import glob
import itertools
import os
import os.path

QM_CSV_FILE = "qm.csv"
DESC_FOLDER = "."

#####################################################################
# A few functions copied from std_ops - pasted here to save people
# having to install std_ops to use this script.
#####################################################################


def flatten(l, ltypes=(list, tuple)):
    """
    Reduce an iterable containing other iterables to a single flat
    sequence of non-iterable items.

    The result has the same type as the input (a list in gives a list out,
    a tuple in gives a tuple out). Empty nested sequences are dropped.
    The ltypes option controls which element types get flattened; anything
    that is not an instance of ltypes is left untouched.

    This algorithm is taken from:
    http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

    >>> flatten([[0, 1, 2], [3, 4, 5]])
    [0, 1, 2, 3, 4, 5]
    >>> flatten([1, 2, (3, 4)])
    [1, 2, 3, 4]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]])
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]], ltypes=())
    [1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]]
    >>> flatten([1, 2, (3, 4)], ltypes=(list))
    [1, 2, (3, 4)]
    """
    ltype = type(l)
    l = list(l)  # work on a copy so the caller's sequence is not modified
    i = 0
    while i < len(l):
        # Keep expanding position i until it holds a non-flattenable item;
        # this is what handles arbitrarily deep nesting without recursion.
        while isinstance(l[i], ltypes):
            if not l[i]:
                # Empty nested sequence: remove it and step back so that the
                # i += 1 below re-examines whatever slid into position i.
                l.pop(i)
                i -= 1
                break
            else:
                # Splice the nested sequence's items in place of the sequence.
                l[i:i + 1] = l[i]
        i += 1
    return ltype(l)


def find_files(path, recursive=False, pattern='*', skip_links=True,
               full_paths=False):
    """
    Return a list of the files in the folder 'path' whose names match the
    glob 'pattern'. Directories are never included in the result.

    recursive  - if True, sub-folders of path are searched as well.
    pattern    - glob-style pattern that the file names must match.
    skip_links - if True, symbolic links are not followed when looking for
                 sub-folders to recurse into. Links that match the pattern
                 in a searched folder are still returned.
    full_paths - if True, absolute paths are returned, otherwise the paths
                 are relative to the way 'path' was specified.

    Raises ValueError if path is not an existing folder.
    """
    if not os.path.isdir(path):
        raise ValueError("\'%s\' is not a recognised folder" % path)

    matches = glob.glob(os.path.normpath(path + os.sep + pattern))

    # Directories can match the pattern too - only keep the real files.
    found_files = [x for x in matches if not os.path.isdir(x)]

    if recursive:
        path_contents = glob.glob(os.path.normpath(path + os.sep + '*'))

        if skip_links:
            path_contents = [x for x in path_contents
                             if not os.path.islink(x)]

        for sub_dir in path_contents:
            if os.path.isdir(sub_dir):
                found_files.extend(find_files(sub_dir, recursive, pattern,
                                              skip_links, full_paths))

    if full_paths:
        return [os.path.abspath(x) for x in found_files if x]

    return [x for x in found_files if x]

#####################################################################


#####################################################################
# Main script starts here.
#####################################################################

# sys is imported here rather than at the top of the file because only the
# main script needs it, to tell Python 2 and Python 3 apart when opening files.
import sys

# Zero-based index of the "Completion description" column (column 7).
COMPLETION_DESC_COLUMN = 6


def _read_description(path):
    """Return the contents of the description file at path as a string."""
    if sys.version_info[0] >= 3:
        # Stray non-decodable bytes in one page should not abort the whole
        # scan, so they are replaced rather than raising UnicodeDecodeError.
        with open(path, "r", errors="replace") as f:
            return f.read()
    with open(path, "r") as f:
        return f.read()


completed_qms = []

# The csv module wants a binary file on Python 2, but a text file opened
# with newline='' on Python 3.
if sys.version_info[0] >= 3:
    ifp = open(QM_CSV_FILE, 'r', newline='')
else:
    ifp = open(QM_CSV_FILE, 'rb')

with ifp:
    # read the qm.csv file assuming it is tab delimited
    qm_reader = csv.reader(ifp, delimiter='\t')
    for row in qm_reader:
        # blank or truncated lines have no Completion description column,
        # so indexing it would raise an IndexError
        if len(row) <= COMPLETION_DESC_COLUMN:
            continue

        # skip rows that have no entry in the Completion description column
        if not row[COMPLETION_DESC_COLUMN].strip():
            continue

        # an empty qm name would match in every single file
        if not row[0]:
            continue

        completed_qms.append(row[0])

# Build the search patterns once, rather than once per html file. The qm name
# is escaped so that it is matched literally, and the negative lookahead stops
# a qm from matching as the prefix of a longer number (e.g. "-1" inside "-12").
qm_patterns = [(qm, re.compile(re.escape(qm) + r"(?!\d)"))
               for qm in completed_qms]

#get a list of all the html files in the description folder
html_files = find_files(DESC_FOLDER, pattern="*.html")

#search each html file for references to each completed qm
for desc_file in html_files:
    contents = _read_description(desc_file)

    for qm, qm_pattern in qm_patterns:
        if qm_pattern.search(contents):
            print("Reference to " + qm + " found in " + desc_file)