Build an index of the words in a directory of XML files, then search that index. Based on the indexing code by Alex Martelli, Linux Magazine, July 2002.
import glob, getopt
import fileinput,re,shelve,linecache,sys
from TextSplitter import TextSplitter
# Matches either a complete XML tag ('<...>') or a word built from word
# characters and hyphens.  Tags are matched only so that genIndex can skip
# them; a words-only pattern would index the tag names as well.
aword = re.compile(r'<[^<>]*>|\b[\w-]+\b')

# Word -> list of (file name, line number) pairs from the most recent
# genIndex() run.
index = {}


# Generate an index in file indexFileName
def genIndex(indexFileName, extension):
    """Index the words of every '*.<extension>' file in the current directory.

    Each line is lower-cased and split with ``aword``; every match that is
    not an XML tag is recorded against its (file name, line number).  The
    result is left in the module-level ``index`` dict and written to a new
    shelf called ``indexFileName`` (any existing shelf of that name is
    replaced, because the shelf is opened with flag 'n').

    Matching is done one line at a time, so a tag that is split across
    lines is not recognised as a tag and its pieces are indexed as words.

    Parameters:
        indexFileName -- name of the shelf file to create.
        extension     -- file extension to glob for, without the dot.
    """
    # Start from scratch so that a second call does not append duplicate
    # locations to the entries left behind by the first one.
    index.clear()

    # Sorted so that the order of locations is reproducible; glob itself
    # returns names in arbitrary order.
    fileNames = sorted(glob.glob('*.' + extension))

    # An empty file list makes fileinput read standard input instead, so
    # only start reading when there is at least one file to read.
    if fileNames:
        reader = fileinput.FileInput(fileNames)
        try:
            for line in reader:
                location = reader.filename(), reader.filelineno()
                for word in aword.findall(line.lower()):
                    if not word.startswith('<'):
                        index.setdefault(word, []).append(location)
        finally:
            reader.close()

    shelf = shelve.open(indexFileName, 'n')
    try:
        for word in index:
            shelf[word] = index[word]
    finally:
        # Close even if a write fails so the shelf file is not left open.
        shelf.close()
# Command-line usage.
def usage():
    """Print the command-line usage summary to standard output."""
    # Single-argument print(...) calls produce the same output under
    # Python 2 and Python 3, unlike the bare print statement.
    print("Usage: \n\txmlIndexer -c filename ")
    print("\tto create an index of all xml files in current directory in 'filename'")
    print("\t xmlIndexer -f filename -s searchPattern")
    print("\tto search the current index 'filename' for 'searchPattern'")
   
# Command-line handling.
def parseArgs(argv):
    """Work out what the command line asks for.

    Parameters:
        argv -- the argument list without the program name.

    Returns a tuple (action, indexFile, searchPattern) where action is
    'help', 'create' or 'search'.  Options are examined in order and the
    first -h or -c decides the action, ignoring whatever follows.

    Raises:
        getopt.GetoptError -- unknown option or missing option argument.
        ValueError -- the options do not describe a complete request
                      (a search without an index file, or neither a
                      create nor a search).
    """
    opts, args = getopt.getopt(
        argv, "c:s:f:h", ["help", "create=", "search=", "filename="])
    indexFile = ""
    searchPattern = ""
    for o, a in opts:
        # getopt reports long options by their full name, so '--help' is
        # the spelling to test for (abbreviations are expanded for us).
        if o in ("-h", "--help"):
            return ("help", "", "")
        if o in ("-c", "--create"):
            return ("create", a, "")
        if o in ("-f", "--filename"):
            indexFile = a
        elif o in ("-s", "--search"):
            searchPattern = a

    if searchPattern != "":
        if indexFile == "":
            raise ValueError("Need an index file to search for a pattern")
        return ("search", indexFile, searchPattern)
    # Without -c or -s there is nothing to do.  Rebuilding the index here
    # would silently overwrite the file named by -f.
    raise ValueError("Need -c to create an index or -s to search one")


def searchIndex(indexFile, word):
    """Look up word in the shelf indexFile and print where it occurs.

    The lookup is done on the lower-cased word because the index is built
    from lower-cased text; the word is echoed as it was typed.

    Returns the list of (file name, line number) pairs found, or an empty
    list when the word is not in the index.
    """
    shelf = shelve.open(indexFile, 'r')
    try:
        locations = shelf[word.lower()]
    except KeyError:
        print(word + ': not found')
        return []
    finally:
        shelf.close()

    hits = [' in file ' + fileName + ' line: ' + str(lineNo)
            for fileName, lineNo in locations]
    # The first hit shares a line with the "Word ... is" heading; every
    # further hit goes on a line of its own.
    print("Word  " + word + ' is ' + "\n".join(hits))
    return locations


def main(argv):
    """Run the indexer for the given arguments and return the exit status.

    Returns 0 on success (including -h) and 2 on a usage error.
    """
    if not argv:
        usage()
        return 2
    try:
        action, indexFile, searchPattern = parseArgs(argv)
    except getopt.GetoptError:
        usage()
        print("Option Exception")
        return 2
    except ValueError as err:
        print("\t Option error. " + str(err))
        usage()
        return 2

    if action == "help":
        usage()
        return 0
    if action == "search":
        print("Searching for " + searchPattern + " in index " + indexFile)
        searchIndex(indexFile, searchPattern)
        return 0

    # action == "create": build the index from the xml files.
    genIndex(indexFile, 'xml')
    print("Index generated in file " + indexFile)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
      
 | 
I needed it for a website index. Issues: it probably needs tidying; I like Python, but I am not very experienced with it. An alternative would be XSLT, but I also need it for the HTML produced from the XML :-)
Indexing idea: http://www.linux-mag.com/2002-07/python_01.html

 Download
Download Copy to clipboard
Copy to clipboard
