Welcome, guest | Sign In | My Account | Store | Cart

Build an index of the words found in a directory of XML files, then search that index. Based on the indexing code by Alex Martelli in Linux Magazine, July 2002.

Python, 85 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import glob, getopt
import fileinput,re,shelve,linecache,sys
from TextSplitter import TextSplitter



# Matches either a complete markup tag (so it can be recognised and skipped)
# or a run of word characters and hyphens.  The pattern is applied one line at
# a time, so a tag that spans several lines is not recognised as markup.
aword = re.compile(r'<[^<>]*>|\b[\w-]+\b')

# word -> list of (filename, line number); holds the result of the most recent
# genIndex() call.
index = {}


def genIndex(indexFileName, extension):
    """Index every word in the ``*.extension`` files of the current directory.

    Each line is lower-cased and split with ``aword``; matches that start
    with ``<`` (markup tags) are skipped.  Every remaining word is mapped to
    the list of ``(filename, line number)`` pairs where it occurs.

    Parameters:
        indexFileName: name of the shelve database to write.  It is opened
            with flag ``'n'``, so any existing database is replaced.
        extension: file extension (without the dot) of the files to index.

    Side effects:
        The module-level ``index`` dict is replaced (in place) with the new
        mapping, and the same mapping is written to the shelf.
    """
    filenames = glob.glob('*.' + extension)
    found = {}

    # fileinput.input() silently reads standard input when it is handed an
    # empty list, so only start it when at least one file matched.
    if filenames:
        try:
            for line in fileinput.input(filenames):
                location = fileinput.filename(), fileinput.filelineno()
                for word in aword.findall(line.lower()):
                    if word[0] != '<':
                        found.setdefault(word, []).append(location)
        finally:
            # Reset fileinput's global state even on error; otherwise the
            # next fileinput.input() call raises "input() already active".
            fileinput.close()

    # Replace rather than extend the global index so that a second call does
    # not carry over locations from a previous run.  Mutating in place keeps
    # existing references to ``index`` valid.
    index.clear()
    index.update(found)

    shelf = shelve.open(indexFileName, 'n')
    try:
        for word in index:
            shelf[word] = index[word]
    finally:
        shelf.close()

# cmd line usage.

def usage():
    """Print the command-line help for both modes (create and search).

    Each message is passed to print as a single parenthesised string so the
    function produces the same output under Python 2 and Python 3.
    """
    print("Usage: \n\txmlIndexer -c filename ")
    print("\tto create an index of all xml files in current directory in 'filename'")
    print("\t xmlIndexer -f filename -s searchPattern")
    print("\tto search the current index 'filename' for 'searchPattern'")
   
# main.
def _search(indexFile, searchPattern):
    """Print every (file, line) recorded for searchPattern in the shelf indexFile."""
    print("Searching for " + searchPattern + " in index " + indexFile)

    shelf = shelve.open(indexFile, 'r')
    try:
        # genIndex() lower-cases each line before indexing, so every key in
        # the shelf is lower case; look the word up the same way, otherwise a
        # pattern containing an upper-case letter can never match.
        locations = shelf[searchPattern.lower()]
    except KeyError:
        print(searchPattern + ': not found')
    else:
        # Written without a newline so the first location continues the line.
        sys.stdout.write("Word  " + searchPattern + " is ")
        for fname, lineno in locations:
            print(' in file ' + fname + ' line: ' + str(lineno))
    finally:
        shelf.close()


def main(argv=None):
    """Run the command-line tool and return the process exit status.

    Parameters:
        argv: option list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success (including ``-h`` and "word not found"), 2 on a usage
        error.

    Options:
        -c/--create FILE     index the xml files of the current directory
                             into FILE; takes priority over a search.
        -f/--filename FILE   index to search.
        -s/--search WORD     word to look up in the index.
        -h/--help            show the usage text.

    Errors raised by shelve when the index cannot be opened are not caught.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        usage()
        return 2

    try:
        opts, _positional = getopt.getopt(
            argv, "c:s:f:h", ["help", "create=", "search=", "filename="])
    except getopt.GetoptError:
        usage()
        print("Option Exception")
        return 2

    indexFile = ""
    searchPattern = ""
    create = False
    # getopt reports long options by their full name (even when the user
    # typed an abbreviation), so only "-x" and "--full-name" need matching.
    for opt, value in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-c", "--create"):
            indexFile = value
            searchPattern = ""   # creating an index overrides any search
            create = True
            break
        if opt in ("-f", "--filename"):
            indexFile = value
        if opt in ("-s", "--search"):
            searchPattern = value

    if create:
        genIndex(indexFile, 'xml')
        print("Index generated in file " + indexFile)
        return 0

    if searchPattern == "":
        # Neither -c nor -s was given.  Do not fall through to index
        # generation: that would overwrite the index named by -f, or fail on
        # an empty file name.
        print("\t Option error. Need -c to create an index or -s to search one")
        usage()
        return 2

    if indexFile == "":
        print("\t Option error. Need an index file to search for a pattern")
        usage()
        return 2

    _search(indexFile, searchPattern)
    return 0


if __name__ == '__main__':
    sys.exit(main())
      

I needed this for a website index. Known issues: the code probably needs tidying; I like Python, but I am not very experienced with it. An alternative would be XSLT, but I also need to index the HTML produced from the XML :-)

Indexing idea: http://www.linux-mag.com/2002-07/python_01.html