# expoweb/1623/204/find_dead_qms.py

"""
Quick and dirty Python script to find references to completed qms in the
cave description pages. Run this to find which bits of description
need updating.

The list of qms is read from the qm.csv file and any with an entry in the
"Completion description" column (column 7) are searched for in all the html
files.

The script prints a list of the completed qms that it found references to
and in which file.

Nial Peters - 2011
"""
import csv
import glob
import itertools
import os
import os.path
import re
import sys

QM_CSV_FILE = "qm.csv"
DESC_FOLDER = "."


#####################################################################
# A few functions copied from std_ops - pasted here to save people
# having to install std_ops to use this script.
#####################################################################

def flatten(l, ltypes=(list, tuple)):
    """
    Collapse nested containers inside ``l`` into a single flat sequence.

    Only elements that are instances of ``ltypes`` are expanded; every
    other element is kept as it is. Empty containers of those types are
    dropped. The flattened items are passed to ``type(l)`` to build the
    return value, so a list comes back as a list and a tuple as a tuple.

    The algorithm is taken from:
    http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

    >>> flatten([[0, 1, 2], [3, 4, 5]])
    [0, 1, 2, 3, 4, 5]
    >>> flatten([1, 2, (3, 4)])
    [1, 2, 3, 4]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]])
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]], ltypes=())
    [1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]]
    >>> flatten([1, 2, (3, 4)], ltypes=(list))
    [1, 2, (3, 4)]
    """
    container_type = type(l)
    items = list(l)
    pos = 0
    while pos < len(items):
        current = items[pos]
        if not isinstance(current, ltypes):
            # A leaf: leave it in place and move on.
            pos += 1
            continue
        if current:
            # Splice the children in where the container was, then look at
            # the same position again because the first child may itself
            # need expanding.
            items[pos:pos + 1] = current
        else:
            # An empty container contributes nothing; the next element
            # slides into this position.
            del items[pos]
    return container_type(items)


def find_files(path, recursive=False, pattern='*', skip_links=True, full_paths=False):
    """
    Return the files inside ``path`` whose names match the glob ``pattern``.

    Directories are never included in the result, even when their name
    matches the pattern.

    path       - folder to search; must be an existing directory.
    recursive  - if True, sub-folders of ``path`` are searched as well.
    pattern    - glob-style pattern that file names are matched against.
    skip_links - if True, symbolic links are not followed when descending
                 into sub-folders (only relevant when ``recursive`` is set).
    full_paths - if True, every returned path is made absolute.

    Returns a flat list of path strings. Raises ValueError if ``path`` is
    not a directory.
    """
    if not os.path.isdir(path):
        # Call syntax (rather than "raise X, msg") works on Python 2 and 3.
        raise ValueError("'%s' is not a recognised folder" % path)

    # A directory may match the pattern too; only non-directories are kept.
    found_files = [x for x in glob.glob(os.path.normpath(path + os.sep + pattern))
                   if not os.path.isdir(x)]

    if recursive:
        # The folder listing is only needed when descending, so it is not
        # computed for a plain, single-level search.
        for entry in glob.glob(os.path.normpath(path + os.sep + '*')):
            if skip_links and os.path.islink(entry):
                continue
            if os.path.isdir(entry):
                # extend() keeps the result flat, so no post-processing of
                # nested lists is required.
                found_files.extend(
                    find_files(entry, recursive, pattern, skip_links, full_paths))

    if full_paths:
        return [os.path.abspath(x) for x in found_files]
    return found_files

#####################################################################
#####################################################################
# Main script starts here.
#####################################################################
completed_qms = []

# The csv module wants a binary file on Python 2 but a text file opened with
# newline="" on Python 3, so the open() arguments are chosen once here.
if sys.version_info[0] < 3:
    _csv_open_args = {"mode": "rb"}
    _html_open_args = {"mode": "r"}
else:
    # errors="replace" keeps a stray undecodable byte from aborting the scan.
    _csv_open_args = {"mode": "r", "newline": "", "errors": "replace"}
    _html_open_args = {"mode": "r", "errors": "replace"}

with open(QM_CSV_FILE, **_csv_open_args) as ifp:
    # read the qm.csv file assuming it is tab delimited
    qm_reader = csv.reader(ifp, delimiter='\t')

    # NOTE(review): a header row, if qm.csv has one, is treated like any
    # other row - confirm against the actual file.
    for row in qm_reader:
        # Blank or truncated lines have no column 7; treat them the same as
        # rows with an empty Completion description and skip them.
        if len(row) < 7 or not row[6].strip():
            continue
        # An empty id would produce a pattern that matches almost any file.
        if not row[0].strip():
            continue
        completed_qms.append(row[0])

# get a list of all the html files in the description folder
html_files = find_files(DESC_FOLDER, pattern="*.html")

# Build each pattern once instead of once per file. The id is escaped so it
# is matched literally, and the lookahead stops e.g. an id ending in "1"
# from matching a longer id ending in "10".
_qm_patterns = [(qm, re.compile(re.escape(qm) + r"(?!\d)")) for qm in completed_qms]

# search each html file for references to each completed qm
for desc_file in html_files:
    with open(desc_file, **_html_open_args) as f:
        contents = f.read()
    for qm, qm_pattern in _qm_patterns:
        if qm_pattern.search(contents):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Reference to " + qm + " found in " + desc_file)