expoweb/1623/204/find_dead_qms.py

114 lines
3.6 KiB
Python
Raw Normal View History

"""
Quick and dirty Python script to find references to completed qms in the
cave description pages. Run this to find which bits of description
need updating.
The list of qms is read from the tab-delimited qm.csv file. Any qm with an
entry in the "Completion description" column (column 7) is treated as
completed and is searched for in every html file in the description folder
(DESC_FOLDER; sub-folders are not searched).
For each reference found, the script prints the completed qm and the name of
the file that refers to it.
Nial Peters - 2011
"""
import csv
import glob
import itertools
import os
import os.path
import re
import sys
# Path of the tab-delimited qm list read by the main script below. It is a
# relative path, so it is resolved against the current working directory.
QM_CSV_FILE = "qm.csv"
# Folder whose *.html files are searched for references to completed qms.
DESC_FOLDER = "."
#####################################################################
# A few functions copied from std_ops - pasted here to save people
# having to install std_ops to use this script.
#####################################################################
def flatten(l, ltypes=(list, tuple)):
    """
    Collapse arbitrarily nested containers into a single flat sequence.

    Only elements that are instances of ``ltypes`` are expanded; any other
    element is kept as a single item. Empty containers of those types are
    dropped. The result is rebuilt with the type of the ``l`` argument, so
    a list in gives a list out and a tuple in gives a tuple out.

    The algorithm is taken from:
    http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

    >>> flatten([[0, 1, 2], [3, 4, 5]])
    [0, 1, 2, 3, 4, 5]
    >>> flatten([1, 2, (3, 4)])
    [1, 2, 3, 4]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]])
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]], ltypes=())
    [1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]]
    >>> flatten([1, 2, (3, 4)], ltypes=list)
    [1, 2, (3, 4)]
    """
    result_type = type(l)
    items = list(l)
    pos = 0
    while pos < len(items):
        current = items[pos]
        if not isinstance(current, ltypes):
            # A plain item: leave it in place and move on.
            pos += 1
            continue
        if current:
            # Splice the container's contents in where it stood. The position
            # is not advanced, so the first spliced item is examined next and
            # expanded in turn if it is itself a container.
            items[pos:pos + 1] = current
        else:
            # An empty container contributes nothing to the result.
            del items[pos]
    return result_type(items)
def find_files(path, recursive=False, pattern='*', skip_links=True, full_paths=False):
    """
    Return the files inside the folder ``path`` whose names match ``pattern``.

    path       -- folder to search.
    recursive  -- if True, the sub-folders of ``path`` are searched as well;
                  their matches follow those found in ``path`` itself.
    pattern    -- glob-style pattern applied to the entries of each folder.
    skip_links -- if True, symbolic links are ignored when choosing which
                  sub-folders to descend into. A link to a file that matches
                  ``pattern`` is still returned.
    full_paths -- if True, every result is passed through os.path.abspath().

    Returns a flat list of path strings. Directories are never included,
    even when their names match ``pattern``.
    Raises ValueError if ``path`` is not an existing folder.
    """
    if not os.path.isdir(path):
        # Call-style raise: valid on both Python 2 and Python 3.
        raise ValueError("\'%s\' is not a recognised folder" % path)

    matches = glob.glob(os.path.normpath(path + os.sep + pattern))
    found_files = [x for x in matches if not os.path.isdir(x)]  # now with no dirs in it

    if recursive:
        # The folder listing is only needed to locate sub-folders, so it is
        # skipped entirely for a non-recursive search.
        entries = glob.glob(os.path.normpath(path + os.sep + '*'))
        if skip_links:
            entries = [x for x in entries if not os.path.islink(x)]
        for sub_dir in entries:
            if os.path.isdir(sub_dir):
                found_files.extend(
                    find_files(sub_dir, recursive, pattern, skip_links, full_paths))

    if full_paths:
        return [os.path.abspath(x) for x in found_files]
    return found_files
#####################################################################
#####################################################################
# Main script starts here.
#####################################################################
completed_qms = []
# The csv module needs a binary-mode file on Python 2, but a text-mode file
# opened with newline='' on Python 3.
if sys.version_info[0] >= 3:
    ifp = open(QM_CSV_FILE, 'r', newline='')
else:
    ifp = open(QM_CSV_FILE, 'rb')
with ifp:
    # read the qm.csv file assuming it is tab delimited
    qm_reader = csv.reader(ifp, delimiter='\t')
    for row in qm_reader:
        # Blank lines and rows too short to reach the Completion description
        # column (index 6) are skipped rather than raising IndexError.
        if len(row) < 7 or not row[6].strip():
            # skip rows that have no entry in the Completion description column
            continue
        # An empty id would produce a pattern that matches every file.
        if not row[0].strip():
            continue
        completed_qms.append(row[0])

# Build one pattern per completed qm, once, instead of once per html file.
# The id is escaped so that it is matched literally; the negative lookahead
# stops e.g. "12" from being reported as a reference to qm "1".
qm_patterns = [(qm, re.compile(re.escape(qm) + r"(?!\d)")) for qm in completed_qms]

# get a list of all the html files in the description folder
html_files = find_files(DESC_FOLDER, pattern="*.html")

# search each html file for references to each completed qm
for desc_file in html_files:
    with open(desc_file, "r") as f:
        contents = f.read()
    for qm, qm_pattern in qm_patterns:
        if qm_pattern.search(contents):
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print("Reference to " + qm + " found in " + desc_file)