expoweb/1623/204/find_dead_qms.py

"""
Quick and dirty Python script to find references to completed qms in the 
cave description pages. Run this to find which bits of description
need updating.

The list of qms is read from the qm.csv file and any with an entry in the
"Completion description" column (column 7) are searched for in all the html
files.

The script prints a list of the completed qms that it found references to
and in which file.

Nial Peters - 2011
"""
import csv
import glob
import itertools
import os
import os.path
import re
import sys

QM_CSV_FILE = "qm.csv"
DESC_FOLDER = "."


#####################################################################
# A few functions copied from std_ops - pasted here to save people
# having to install std_ops to use this script.
#####################################################################

def flatten(l, ltypes=(list, tuple)):
    """
    Collapse the nested containers inside ``l`` into one flat sequence.

    ``l`` is copied into a list, then walked from left to right. Any
    element that is an instance of ``ltypes`` is replaced in place by its
    own contents and examined again, so nesting of any depth is removed.
    Empty nested containers disappear from the result. Elements that are
    not instances of ``ltypes`` are kept untouched, which is how the
    ``ltypes`` option controls what gets flattened.

    The return value is built by calling ``type(l)`` on the flattened
    list, so a list comes back as a list and a tuple as a tuple.

    The algorithm is taken from:
    http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html

    >>> flatten([[0, 1, 2], [3, 4, 5]])
    [0, 1, 2, 3, 4, 5]
    >>> flatten([1, 2, (3, 4)])
    [1, 2, 3, 4]
    >>> flatten([1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]])
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten([1, [2, 3, [4, 5]]], ltypes=())
    [1, [2, 3, [4, 5]]]
    >>> flatten([1, 2, (3, 4)], ltypes=list)
    [1, 2, (3, 4)]
    >>> flatten((1, [2, []], ()))
    (1, 2)
    """
    container = type(l)
    items = list(l)
    pos = 0
    while pos < len(items):
        current = items[pos]
        if not isinstance(current, ltypes):
            # Plain element: keep it and move on.
            pos += 1
        elif current:
            # Non-empty container: splice its contents in at this position
            # and look at the same position again, since the first spliced
            # item may itself need flattening.
            items[pos:pos + 1] = current
        else:
            # Empty container: drop it; the next element slides into pos.
            del items[pos]
    return container(items)
   

def find_files(path, recursive=False, pattern='*', skip_links=True, full_paths=False):
    """
    Return the files in the folder ``path`` whose names match ``pattern``.

    path       - the folder to search.
    recursive  - if True, every sub-folder of ``path`` is searched as well,
                 using the same settings.
    pattern    - glob-style pattern that file names must match.
    skip_links - if True, sub-folders that are symbolic links are not
                 descended into. This only affects recursion: files that
                 match ``pattern`` are returned whether or not they are
                 links.
    full_paths - if True, the results are converted to absolute paths;
                 otherwise they are returned as produced by glob on the
                 normalised ``path``.

    Returns a flat list of path strings. Directories are never included,
    even when their names match ``pattern``. Files found directly in
    ``path`` come first, followed by the results for each sub-folder.

    Raises ValueError if ``path`` is not an existing folder.
    """
    if not os.path.isdir(path):
        # Call syntax works on both Python 2 and 3; the old
        # "raise E, msg" form is a syntax error on Python 3.
        raise ValueError("\'%s\' is not a recognised folder" % path)

    matches = glob.glob(os.path.normpath(path + os.sep + pattern))
    found_files = [x for x in matches if not os.path.isdir(x)]

    if recursive:
        # The folder listing is only needed to find sub-folders, so it is
        # skipped entirely for a non-recursive search.
        for entry in glob.glob(os.path.normpath(path + os.sep + '*')):
            if skip_links and os.path.islink(entry):
                continue
            if os.path.isdir(entry):
                # extend() keeps the result flat, so no separate flattening
                # pass is required afterwards.
                found_files.extend(
                    find_files(entry, recursive, pattern, skip_links, full_paths))

    if full_paths:
        return [os.path.abspath(x) for x in found_files]
    return found_files
       
#####################################################################
#####################################################################
# Main script starts here.
#####################################################################
completed_qms = []

# The csv module wants a binary-mode file on Python 2, but a text-mode file
# opened with newline='' on Python 3, so pick the mode to suit the
# interpreter running the script.
if sys.version_info[0] < 3:
    ifp = open(QM_CSV_FILE, 'rb')
else:
    ifp = open(QM_CSV_FILE, 'r', newline='')

with ifp:
    # read the qm.csv file assuming it is tab delimited
    qm_reader = csv.reader(ifp, delimiter='\t')

    for row in qm_reader:
        # Skip rows that have no entry in the Completion description column
        # (column 7). Blank lines and truncated rows have no column 7 at
        # all, so the length is checked first to avoid an IndexError.
        if len(row) < 7 or not row[6].strip():
            continue
        # A row without a qm name would produce a pattern that matches
        # every file, so it is ignored as well.
        if not row[0]:
            continue
        completed_qms.append(row[0])

# Build one compiled pattern per completed qm, once, instead of once per
# file. re.escape makes the qm name match literally even if it contains
# regex metacharacters; the (?!\d) lookahead stops a name ending in "3"
# from matching inside a longer name ending in "39".
qm_patterns = [(qm, re.compile(re.escape(qm) + r"(?!\d)"))
               for qm in completed_qms]

# get a list of all the html files in the description folder
html_files = find_files(DESC_FOLDER, pattern="*.html")

# search each html file for references to each completed qm
for desc_file in html_files:
    with open(desc_file, "r") as f:
        contents = f.read()
    for qm, qm_pattern in qm_patterns:
        # search() stops at the first hit; only existence matters here.
        if qm_pattern.search(contents):
            # Parenthesised single argument prints the same text on both
            # Python 2 and Python 3.
            print("Reference to " + qm + " found in " + desc_file)
Nial: added more images to description pages, made all thumbnails equal size, added script for finding dead qms in the description pages 2011-10-16 18:12:41 +01:00			`"""`
			`Quick and dirty Python script to find references to completed qms in the`
			`cave description pages. Run this to find which bits of description`
			`need updating.`

			`The list of qms is read from the qm.csv file and any with an entry in the`
			`"Completion description" column (column 7) are searched for in all the html`
			`files.`

			`The script prints a list of the completed qms that it found references to`
			`and in which file.`

			`Nial Peters - 2011`
			`"""`
			`import csv`
			`import re`
			`import glob`
			`import itertools`
			`import os`
			`import os.path`

			`QM_CSV_FILE = "qm.csv"`
			`DESC_FOLDER = "."`


			`#####################################################################`
			`# A few functions copied from std_ops - pasted here to save people`
			`# having to install std_ops to use this script.`
			`#####################################################################`

			`def flatten(l, ltypes=(list, tuple)):`
			`"""`
			`Reduces any iterable containing other iterables into a single list`
			`of non-iterable items. The ltypes option allows control over what`
			`element types will be flattened. This algorithm is taken from:`
			`http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html`

			`>>> print flatten([range(3),range(3,6)])`
			`[0, 1, 2, 3, 4, 5]`
			`>>> print flatten([1,2,(3,4)])`
			`[1, 2, 3, 4]`
			`>>> print flatten([1,[2,3,[4,5,[6,[7,8,[9,[10]]]]]]])`
			`[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`
			`>>> print flatten([1,[2,3,[4,5,[6,[7,8,[9,[10]]]]]]], ltypes=())`
			`[1, [2, 3, [4, 5, [6, [7, 8, [9, [10]]]]]]]`
			`>>> print flatten([1,2,(3,4)],ltypes=(list))`
			`[1, 2, (3, 4)]`
			`"""`
			`ltype = type(l)`
			`l = list(l)`
			`i = 0`
			`while i < len(l):`
			`while isinstance(l[i], ltypes):`
			`if not l[i]:`
			`l.pop(i)`
			`i -= 1`
			`break`
			`else:`
			`l[i:i + 1] = l[i]`
			`i += 1`
			`return ltype(l)`


			`def find_files(path, recursive=False, pattern='*', skip_links=True, full_paths=False):`
			`if not os.path.isdir(path):`
			`raise ValueError, "\'%s\' is not a recognised folder" %path`

			`found_files = glob.glob(os.path.normpath(path + os.sep + pattern))`
			`path_contents = glob.glob(os.path.normpath(path + os.sep + '*'))`

			`if skip_links:`
			`path_contents = [x for x in itertools.ifilterfalse(os.path.islink, path_contents)]`

			`dirs = [x for x in itertools.ifilter(os.path.isdir, path_contents)]`
			`found_files = [x for x in itertools.ifilterfalse(os.path.isdir, found_files)] #now with no dirs in it`

			`if recursive:`
			`found_files += [find_files(x, recursive, pattern, skip_links, full_paths) for x in dirs]`

			`if full_paths:`
			`return [os.path.abspath(x) for x in flatten(found_files) if x]`
			`else:`
			`return [x for x in flatten(found_files) if x]`

			`#####################################################################`
			`#####################################################################`
			`# Main script starts here.`
			`#####################################################################`
			`completed_qms = []`

			`with open(QM_CSV_FILE,'rb') as ifp:`
			`# read the qm.csv file assuming it is tab delimited`
			`qm_reader = csv.reader(ifp, delimiter='\t')`

			`for row in qm_reader:`
			`if row[6] == "" or row[6].isspace():`
			`#skip rows that have no entry in the Completion description column`
			`continue`
			`completed_qms.append(row[0])`

			`#get a list of all the html files in the description folder`
			`html_files = find_files(DESC_FOLDER, pattern="*.html")`

			`#search each html file for references to each completed qm`
			`for desc_file in html_files:`
			`with open(desc_file,"r") as f:`
			`contents = f.read()`
			`for qm in completed_qms:`
			`if len(re.findall(qm + "(?!\d)",contents))!=0:`
			`print "Reference to "+qm + " found in "+desc_file`