# -*- coding: utf-8 -*-
###############################################################################
# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM)                 #
# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED)                 #
# Language used: PYTHON .                                                     #
###############################################################################
# Interactive script: reads a whitespace-separated table (header row, first
# column = record identifier), trains a square Kohonen grid on the remaining
# columns and writes the catalogued records plus statistics to text files.
# The code runs unchanged on Python 2 and Python 3.
import os
import random
import copy
import datetime
import sys

try:
    read_input = raw_input      # Python 2
except NameError:
    read_input = input          # Python 3

# Value of the minimum error while no epoch has produced a minimum yet.
NO_RESULT_ERR = 1000000000.0

BANNER = (
    "###############################################################################",
    "# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #",
    "# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #",
    "# Language used: PYTHON #",
    "###############################################################################",
)


def mean(x):
    """Return the arithmetic mean of the numeric sequence x.

    Raises ZeroDivisionError when x is empty.
    """
    return sum(x) / float(len(x))


def sd(x):
    """Return the population standard deviation (divisor n) of x.

    Raises ZeroDivisionError when x is empty.
    """
    n = len(x)
    centre = sum(x) / float(n)
    return (sum((value - centre) ** 2 for value in x) / n) ** 0.5


def is_number(s):
    """Return True when float(s) succeeds, False otherwise."""
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        return False


class ndim:
    """Map coordinates of a 3D array onto a flat, row-major array.

    how to use ndim class
        n = ndim(4, 4, 5, 3)
        n.getcellindex((0, 0, 1))   -> 1
        n.getcellindex((0, 1, 0))   -> 5
        n.getcellindex((1, 0, 0))   -> 20
        n.getlocation(20)           -> [1, 0, 0]
        n.getlocation(5)            -> [0, 1, 0]
    """

    def __init__(self, x, y, z, d):
        self.dimensions = [x, y, z]
        self.numdimensions = d
        self.gridsize = x * y * z

    def getcellindex(self, location):
        """Return the flat index of the cell at the given coordinates."""
        cindex = 0
        cdrop = self.gridsize
        for index in range(self.numdimensions):
            # Floor division keeps the index an int on Python 3 as well;
            # a float index cannot be used to subscript the grid list.
            cdrop //= self.dimensions[index]
            cindex += cdrop * location[index]
        return cindex

    def getlocation(self, cellindex):
        """Return the [x, y, z] coordinates of a flat cell index."""
        res = []
        for size in reversed(self.dimensions):
            res.append(cellindex % size)
            cellindex //= size
        return res[::-1]


def _ask_file():
    """Prompt until the name of an existing file is entered."""
    while True:
        name = read_input('InputFile : ')
        if os.path.isfile(name):
            return name
        print("Oops! File does not exist. Try again... or CTR/C to exit")


def _ask_number(prompt, convert, low, high, too_low_msg, too_big_msg):
    """Prompt until convert(answer) succeeds and lies within [low, high]."""
    while True:
        try:
            value = convert(read_input(prompt))
        except ValueError:
            print("Oops! That was no valid number. Try again...")
            continue
        if value < low:
            print(too_low_msg)
        elif value > high:
            print(too_big_msg)
        else:
            return value


def _ask_normalization():
    """Prompt until the answer starts with M, S or N (case-insensitive)."""
    while True:
        # [:1] instead of [0]: an empty answer is rejected, not a crash.
        answer = read_input('Normalization(Max, Std, None) : ').upper()[:1]
        if answer in ('M', 'S', 'N'):
            return answer
        print("Oops! Input M, S or N. Try again...")


def _output_prefix(path):
    """Return path cut at the first dot of its file name.

    Only the base name is split, so a dot inside the directory part
    (for instance "./data.txt") no longer truncates the whole path.
    """
    folder, name = os.path.split(path)
    return os.path.join(folder, name.split(".")[0])


def _group_label(gx, gy):
    """Return the group key "G_xx_yy" with zero-padded grid coordinates."""
    return "G_%02d_%02d" % (gx, gy)


def _read_table(path):
    """Read the input file; return (header fields, list of record fields).

    Every record must have as many fields as the header. The program exits
    when that is not the case, or when there is no header, no variable
    column or no record at all.
    """
    header = None
    rows = []
    bad_columns = False
    with open(path) as handle:
        for number, line in enumerate(handle):
            fields = line.split()
            if number == 0:
                header = fields
                continue
            rows.append(fields)
            if len(fields) != len(header):
                bad_columns = True
                print("Different numbers of variables / columns in the record "
                      + str(number) + " cols " + str(len(fields)))
    if bad_columns:
        print("File " + path + " contains errors. Exit ")
        sys.exit(1)
    if header is None or len(header) < 2 or not rows:
        print("File " + path + " needs a header row, at least one variable"
              " column and at least one record. Exit ")
        sys.exit(1)
    return header, rows


def _to_numbers(rows):
    """Replace, in place, every field that parses as a number by its float.

    is_number is used rather than str.isdigit so that decimal and negative
    values are recognised as numbers too.
    """
    for row in rows:
        for pos, field in enumerate(row):
            if is_number(field):
                row[pos] = float(field)


def _check_column_types(header, rows):
    """Exit when a variable column mixes numbers and strings.

    All records are inspected, including the first one.
    """
    mixed = False
    for col in range(1, len(header)):
        has_num = False
        has_txt = False
        for row in rows:
            if is_number(row[col]):
                has_num = True
            else:
                has_txt = True
        if has_num and has_txt:
            print("The columns / variables " + header[col]
                  + " contains both strings and numbers.")
            print(rows[-1])
            mixed = True
    if mixed:
        print("Oops! The columns / variables contains both strings and numbers. Exit. ")
        sys.exit(1)


def _encode_text_columns(rows, n_cols):
    """Replace, in place, the strings of each text column by numbers.

    The distinct strings of a column are sorted and spread evenly over
    [0, 1]; a column holding a single distinct string becomes 1.
    """
    for col in range(1, n_cols + 1):
        labels = sorted(set(row[col] for row in rows
                            if isinstance(row[col], str)))
        if not labels:
            continue
        if len(labels) > 1:
            step = 1.0 / (len(labels) - 1)
        else:
            step = 0.0
        rank = dict((label, pos) for pos, label in enumerate(labels))
        for row in rows:
            if len(labels) == 1:        # same values in the column
                value = float(1)
            else:
                value = float(step * rank[row[col]])
            # to avoid zero values in means (to prevent zero divide in CV)
            row[col] = value + 0.00000001


def _column_stats(rows, n_cols):
    """Return (means, maxs, stds) of columns 1..n_cols; index 0 is unused."""
    n_rows = len(rows)
    cols = range(1, n_cols + 1)
    xmeans = [0.0] * (n_cols + 1)
    xmaxs = [-9999999999999999.9] * (n_cols + 1)
    xsds = [0.0] * (n_cols + 1)
    for row in rows:
        for col in cols:
            xmeans[col] += row[col]
            if row[col] > xmaxs[col]:
                xmaxs[col] = row[col]
    for col in cols:
        xmeans[col] = xmeans[col] / n_rows
    for row in rows:
        for col in cols:
            xsds[col] += (row[col] - xmeans[col]) ** 2
    # Sample standard deviation: the divisor is the number of records
    # minus one (not the number of columns); guarded for a single record.
    divisor = max(n_rows - 1, 1)
    for col in cols:
        xsds[col] = (xsds[col] / divisor) ** 0.5
    return xmeans, xmaxs, xsds


def _write_stat_row(handle, label, values):
    """Write one tab-separated row of the Means/Max/Std/CV file."""
    handle.write('%s %s ' % (label, "\t"))
    for value in values:
        handle.write('%s %s ' % (value, "\t"))
    handle.write('%s' % ('\n'))


def _write_medsd(path, header, xmeans, xmaxs, xsds):
    """Write the Means, Max, Std, CV file; return (xcv, mean of the CVs).

    xcv has the same layout as the other statistics (index 0 unused).
    Numbers are written as the first six characters of their text form.
    """
    n_cols = len(header) - 1
    cols = range(1, n_cols + 1)
    xcv = [0.0] * (n_cols + 1)
    med_cv_gen = 0.0            # cv average of all columns / variables
    for col in cols:
        if xmeans[col] == 0:
            centre = 0.000001
        else:
            centre = xmeans[col]
        xcv[col] = xsds[col] / abs(centre)
        med_cv_gen += xcv[col]
    med_cv_gen = med_cv_gen / n_cols
    with open(path, 'w') as handle:
        _write_stat_row(handle, 'Function', [header[col] for col in cols])
        _write_stat_row(handle, 'Mean', [str(xmeans[col])[0:6] for col in cols])
        _write_stat_row(handle, 'Max', [str(xmaxs[col])[0:6] for col in cols])
        _write_stat_row(handle, 'Std', [str(xsds[col])[0:6] for col in cols])
        _write_stat_row(handle, 'CV', [str(xcv[col])[0:6] for col in cols])
    return xcv, med_cv_gen


def _normalize(rows, n_cols, tipo_norm, xmeans, xmaxs, xsds):
    """Standardise the variable columns in place.

    'M' divides by the column maximum; 'S' subtracts the mean, divides by
    the standard deviation and then shifts each column by its minimum so
    that no value is negative; any other value leaves the data unchanged.
    Zero maxima / deviations are replaced by 0.00001 in the given lists.
    """
    cols = range(1, n_cols + 1)
    if tipo_norm == 'M':
        for col in cols:
            if xmaxs[col] == 0.0:
                xmaxs[col] = 0.00001
        for row in rows:
            for col in cols:
                row[col] = row[col] / xmaxs[col]
    if tipo_norm == 'S':
        xmins = [9999999999999999.9] * (n_cols + 1)
        for col in cols:
            if xsds[col] == 0.0:
                xsds[col] = 0.00001
        for row in rows:
            for col in cols:
                row[col] = (row[col] - xmeans[col]) / xsds[col]
                if row[col] < xmins[col]:
                    xmins[col] = row[col]
        # subtract the column minimum to remove the negative values
        for row in rows:
            for col in cols:
                row[col] = row[col] - xmins[col]


def _nearest_cell(values, grid, cells, nx, ny, m):
    """Return (distance, x, y) of the grid cell closest to values.

    The distance is the mean squared deviation over the m variables.
    Ties keep the first cell met in x-major order. The search starts from
    infinity so that a best cell is found whatever the scale of the data.
    """
    best_d = float("inf")
    best_x = 0
    best_y = 0
    for gx in range(nx):
        for gy in range(ny):
            base = cells.getcellindex((gx, gy, 0))
            d = 0.0
            for k in range(m):
                # update the sum of squared deviation of input value
                # from the grid coefficient
                d = d + (values[k] - grid[base + k]) ** 2
            d = d / float(m)
            if d < best_d:
                best_d = d
                best_x = gx
                best_y = gy
    return best_d, best_x, best_y


def _train(samples, nx, ny, m, alpha_max, alpha_min, alpha_step):
    """Run the Kohonen training loop.

    samples is the list of per-record variable vectors (m floats each).
    Returns (cells, grid with the minimum error, minimum error, epoch of
    the minimum, last alpha, number of iterations, epoch counter).
    Training stops after the planned epochs, when alpha reaches alpha_min,
    or on Ctrl/C.
    """
    n_rows = len(samples)
    nsteps = int(10000 * nx * ny)               # number of steps
    nepoks = int(nsteps / n_rows ** 0.5)        # number of epochs
    if nepoks < 20:
        nepoks = 20                             # min epochs = 20
    passo = int(5000 / n_rows)      # step of visualization on monitor
    if passo < 1:
        passo = 1
    rmax = nx - 1
    cells = ndim(nx, ny, m, 3)
    # fixed seed to obtain the same sequences in new runs
    random.seed(950041)
    # Cells are filled in flat (row-major) order, i.e. the order in which
    # nested x / y / k loops would visit them.
    grid = [random.random() for _ in range(nx * ny * m)]    # training grid
    gridm = list(grid)              # initial min grid = current grid
    iterations = 0
    discrea = 1000000000000.0       # error of the epoch before the last one
    discrep = 0.0                   # error of the last completed epoch
    min_epok = 0                    # epoch with min error
    min_err = NO_RESULT_ERR         # min error
    alpha = float(alpha_max)        # initial value of alpha parameter
    ne = 1
    print(" ")
    print('Record ' + str(n_rows) + ' Columns ' + str(m))
    try:
        while ne <= nepoks:
            if ne % passo == 0:     # periodic running message
                min_err_txt = "%14.5f" % min_err
                alpha_txt = "%12.5f" % alpha
                print('Epoch ' + str(ne) + ' min err ' + min_err_txt
                      + ' min epoch ' + str(min_epok - 1)
                      + " alpha " + alpha_txt)
            if min_err > discrea and discrep > discrea and discrea > 0.0:
                min_epok = ne       # current epoch (min)
                min_err = discrea
                gridm = list(grid)  # copy current grid to min grid
                min_err_txt = "%12.3f" % min_err
                alpha_txt = "%12.5f" % alpha
                print('**** Epoch ' + str(ne - 1) + ' WITH MIN ERROR '
                      + min_err_txt + " alpha " + alpha_txt)
            # checking the current value of alpha
            if not alpha > alpha_min:
                print("")
                print("Min alpha reached ")
                print("")
                break
            discrea = discrep
            discrep = 0.0
            for values in samples:
                iterations += 1
                # find the best grid coefficient
                dhit, ihit, jhit = _nearest_cell(values, grid, cells,
                                                 nx, ny, m)
                # update iteration error
                discrep = discrep + dhit
                # radius of the neighbourhood around the best cell
                ir = int(max(rmax * float(1001 - iterations) / 1000.0
                             + 0.9999999999, 1))
                # new alpha value
                alpha = max(alpha_max * float(1 - ne * alpha_step), alpha_min)
                # update the grid coefficients applying alpha parameter,
                # skipping neighbours that fall outside the grid
                for gx in range(max(ihit - ir, 0), min(ihit + ir, nx - 1) + 1):
                    for gy in range(max(jhit - ir, 0),
                                    min(jhit + ir, ny - 1) + 1):
                        base = cells.getcellindex((gx, gy, 0))
                        for k in range(m):
                            grid[base + k] += alpha * (values[k]
                                                       - grid[base + k])
            ne += 1
    except KeyboardInterrupt:
        print("")
        print("KeyboardInterrupt (Ctrl/C) ")
        print("")
    return cells, gridm, min_err, min_epok, alpha, iterations, ne


def _sorted_group_lines(rows, mtchx, mtchy, m):
    """Return the text lines "group recnum id v1 .. vm", sorted as text."""
    lines = []
    for number, row in enumerate(rows):
        values = " ".join(str(row[pos]) for pos in range(m + 1))
        lines.append(_group_label(mtchx[number], mtchy[number])
                     + " " + str(number + 1) + " " + values)
    lines.sort()
    return lines


def _group_cv_table(sorted_lines, m):
    """Build the CV matrix of the groups.

    Each row is [group key, CV of variable 1..m, mean of those CVs,
    number of records]. The last row, "*Means*", holds the CVs averaged
    over the groups weighted by their number of records, the mean of
    those averages and the total number of records.
    Returns (table, mean stored in the "*Means*" row).
    """
    members = {}
    order = []
    for line in sorted_lines:
        fields = line.split()
        if fields[0] not in members:
            members[fields[0]] = []
            order.append(fields[0])
        # fields: group key, record number, record id, then the m values
        members[fields[0]].append([float(v) for v in fields[3:m + 3]])
    table = []
    for key in order:
        group = members[key]
        row_cv = [key]
        row_sum = 0.0
        for k in range(m):
            column = [record[k] for record in group]
            centre = abs(mean(column))
            if centre == 0.0:
                centre = 0.0000000001
            cv_v = sd(column) / centre
            row_cv.append(cv_v)
            row_sum += float(cv_v)
        row_cv.append(row_sum / m)          # mean of the row
        row_cv.append(len(group))           # number of records
        table.append(row_cv)
    means_row = ["*Means*"]
    num_rec = 0
    for k in range(1, m + 1):               # weighted mean of columns
        media_col = 0.0
        num_rec = 0
        for row_cv in table:
            media_col = media_col + row_cv[k] * row_cv[m + 2]
            num_rec = num_rec + row_cv[m + 2]
        means_row.append(media_col / num_rec)
    media_riga = mean(means_row[1:m + 1])
    means_row.append(media_riga)            # total mean
    means_row.append(num_rec)               # number of records
    table.append(means_row)
    return table, media_riga


def _write_cells(handle, cells, width, template):
    """Write every text cell followed by the padding up to width."""
    for cell in cells:
        handle.write(template % (cell, " " * (width - len(cell))))


def _write_grid(path, grid, cells, nx, ny, m):
    """Write one line "x y k coefficient" per grid coefficient."""
    with open(path, 'w') as handle:
        for gx in range(nx):
            for gy in range(ny):
                for k in range(m):
                    ig = cells.getcellindex((gx, gy, k))
                    handle.write('%6i %s %.6i %s %.6i %s %14.7f %s'
                                 % (gx, ' ', gy, ' ', k, ' ', grid[ig], "\n"))


def _write_catal(path, mtchx, mtchy):
    """Write one line "record x y" per record."""
    with open(path, 'w') as handle:
        for number in range(len(mtchx)):
            handle.write("%.6i %s %.6i %s %.6i %s"
                         % (number, ' ', mtchx[number], ' ',
                            mtchy[number], "\n"))


def _write_cv(path, header, table, xcv, m):
    """Write the CV of the groups followed by the CV of the input."""
    titles = ["*Groups*"] + header[1:] + ["*Mean*", "N_recs"]
    with open(path, 'w') as handle:
        _write_cells(handle, titles, 9, '%s %s ')
        handle.write('%s' % ('\n'))
        for row_cv in table:
            handle.write('%s %s ' % (row_cv[0], " "))
            for pos in range(1, m + 2):
                handle.write('%7.4f %s ' % (row_cv[pos], " "))
            handle.write('%6i %s ' % (row_cv[m + 2], " "))
            handle.write('%s' % ("\n"))
        # CV of the input (before catalogue)
        handle.write('%s %s ' % ("*CVinp*", " "))
        for pos in range(1, m + 1):
            handle.write('%7.4f %s ' % (xcv[pos], " "))
        handle.write('%7.4f %s ' % (mean(xcv[1:m + 1]), " "))
        handle.write('%6i %s ' % (table[-1][m + 2], " "))
        handle.write('%s' % ("\n"))


def _write_output(path, header, text_rows, mtchx, mtchy):
    """Write the original records, each preceded by its group."""
    with open(path, 'w') as handle:
        _write_cells(handle, ["*Group*"] + header, 15, '%s %s ')
        handle.write('%s' % ("\n"))
        for number, fields in enumerate(text_rows):
            label = _group_label(mtchx[number], mtchy[number]) + " " * 8
            handle.write('%s ' % (label))
            _write_cells(handle, fields, 15, '%s %s ')
            handle.write('%s' % ("\n"))


def _write_sort(path, header, sorted_lines):
    """Write the normalised records in group sequence."""
    with open(path, 'w') as handle:
        _write_cells(handle, ["*Group*", "*RecNum*"] + header, 15, '%s %s ')
        handle.write('%s' % ("\n"))
        last = len(sorted_lines) - 1
        for number, line in enumerate(sorted_lines):
            _write_cells(handle, [field[0:13] for field in line.split()],
                         15, '%s %s ')
            if number < last:
                handle.write('%s' % ("\n"))


def _write_outsrt(path, header, text_rows, mtchx, mtchy):
    """Write the original records sorted by group."""
    ordered = sorted([_group_label(mtchx[number], mtchy[number])] + fields
                     for number, fields in enumerate(text_rows))
    with open(path, 'w') as handle:
        _write_cells(handle, ["*Group*"] + header, 15, '%s %s')
        handle.write('%s' % ("\n"))
        last = len(ordered) - 1
        for number, fields in enumerate(ordered):
            _write_cells(handle, fields, 15, '%s %s')
            if number < last:
                handle.write('%s' % ("\n"))


def main():
    """Ask the run parameters, catalogue the input file, write the results."""
    for text in BANNER:
        print(text)

    # input and run parameters
    file_input = _ask_file()
    gruppi_num = _ask_number(
        'Number of Groups (3 - 20) : ', int, 3, 20,
        "Oops! Number of Groups too low. Try again...",
        "Oops! Number of Groups too big. Try again...")
    tipo_norm = _ask_normalization()
    alpha_max = _ask_number(
        'Start value of alpha (1.8 - 0.9) : ', float, 0.9, 1.8,
        "Oops! Start value of alpha too low. Try again...",
        "Oops! Start value of alpha too big. Try again...")
    alpha_min = _ask_number(
        'End value of alpha (0.5 - 0.0001) : ', float, 0.0001, 0.5,
        "Oops! alpha too low. Try again...",
        "Oops! alpha too big. Try again...")
    alpha_step = _ask_number(
        'Decreasing step of alpha (0.1 - 0.001) : ', float, 0.001, 0.1,
        "Oops! Decreasing step of alpha too low. Try again...",
        "Oops! Decreasing step of alpha too big. Try again...")

    # outputs files
    prefix = (_output_prefix(file_input) + "_" + tipo_norm
              + "_g" + str(gruppi_num))
    arch_output = prefix + "_out.txt"
    arch_outsrt = prefix + "_outsrt.txt"
    arch_sort = prefix + "_sort.txt"
    arch_catal = prefix + "_catal.txt"
    arch_medsd = prefix + "_medsd.txt"
    arch_cv = prefix + "_cv.txt"
    arch_grid = prefix + "_grid.txt"
    arch_log = prefix + "_log.txt"

    # start time
    t0 = datetime.datetime.now()

    # read input file
    testata, arr_r = _read_table(file_input)
    _to_numbers(arr_r)
    arr_orig = copy.deepcopy(arr_r)     # original input file
    n_cols = len(testata) - 1
    n_rows = len(arr_r)

    # finding columns containing strings and columns containing numbers
    _check_column_types(testata, arr_r)
    # from strings to numbers
    _encode_text_columns(arr_r, n_cols)

    # means, max, std and CV of the input
    xmeans, xmaxs, xsds = _column_stats(arr_r, n_cols)
    xcv, med_cv_gen = _write_medsd(arch_medsd, testata, xmeans, xmaxs, xsds)
    str_med_cv_gen = str(med_cv_gen)[0:6]

    # input standardization
    _normalize(arr_r, n_cols, tipo_norm, xmeans, xmaxs, xsds)

    # kohonen algorithm
    m = n_cols
    nx = gruppi_num
    ny = gruppi_num
    samples = [row[1:m + 1] for row in arr_r]
    cells, grid, min_err, min_epok, alpha, iterations, ne = _train(
        samples, nx, ny, m, alpha_max, alpha_min, alpha_step)

    # write min grid file
    _write_grid(arch_grid, grid, cells, nx, ny, m)

    # catalog input by min grid
    mtchx = []
    mtchy = []
    for values in samples:
        _, ihit, jhit = _nearest_cell(values, grid, cells, nx, ny, m)
        mtchx.append(ihit)
        mtchy.append(jhit)
    _write_catal(arch_catal, mtchx, mtchy)

    # matrix of statistics
    arr_grsg = _sorted_group_lines(arr_r, mtchx, mtchy, m)
    arr_cv, media_riga = _group_cv_table(arr_grsg, m)
    cv_media_gen_after = str(media_riga)[0:6]
    _write_cv(arch_cv, testata, arr_cv, xcv, m)

    # catalog files: original order, group summary, sorted by group
    text_rows = [[field if isinstance(field, str) else str(field)
                  for field in row] for row in arr_orig]
    _write_output(arch_output, testata, text_rows, mtchx, mtchy)
    _write_sort(arch_sort, testata, arr_grsg)
    _write_outsrt(arch_outsrt, testata, text_rows, mtchx, mtchy)

    # Knowledge index, computed from the untruncated values; a zero CV of
    # the input is replaced by the small constant used for zero means.
    cv_before = med_cv_gen
    if cv_before == 0:
        cv_before = 0.000001
    know_index = str(1.0 - media_riga / cv_before)[0:6]

    for text in BANNER:
        print(text)

    output_files = (
        ("Output File Catalog.original ", arch_output),
        ("Output File Catalog.sort ", arch_outsrt),
        ("Output File Summary sort ", arch_sort),
        ("Output File Matrix Catal. ", arch_catal),
        ("Output File Means, STD, CV. ", arch_medsd),
        ("Output File CV of the Groups ", arch_cv),
        ("Output File Training Grid ", arch_grid),
        ("Output File Run Parameters ", arch_log),
    )

    # run parameters file
    log_banner = (
        "############################################################################",
        "# KB_CAT KNOWLEDGE DISCOVERY IN DATA MINING (CATALOG PROGRAM) #",
        "# by ROBERTO BELLO (COPYRIGHT MARCH 2011 ALL RIGHTS RESERVED) #",
        "# Language used: PYTHON . #",
        "############################################################################",
    )
    log_parameters = (
        ("Input File -> ", file_input),
        ("Numer of Groups (3 - 20) -> ", str(gruppi_num)),
        ("Normalization (Max, Std, None) -> ", tipo_norm),
        ("Start Value of alpha (from 1.8 to 0.9) -> ", str(alpha_max)),
        ("End Value of alpha (from 0.5 to 0.0001) -> ", str(alpha_min)),
        ("Decreasing step of alpha (from 0.1 to 0.001) -> ", str(alpha_step)),
    )
    with open(arch_log, 'w') as arch_log_file:
        for text in log_banner:
            arch_log_file.write("%s %s" % (text, "\n"))
        for label, value in log_parameters:
            arch_log_file.write("%s %s %s" % (label, value, "\n"))
        arch_log_file.write("%s" % ("=========================OUTPUT=======================================================\n"))
        for label, value in output_files:
            arch_log_file.write("%s %s %s" % (label, value, "\n"))
        arch_log_file.write('%s %s %s' % ('*KIndex* ', know_index, '\n'))

    # summary on monitor
    min_err_txt = "%12.3f" % min_err
    alpha_txt = "%12.5f" % alpha
    alpha_max_txt = "%12.5f" % alpha_max
    print("")
    if min_err == NO_RESULT_ERR:
        print("Oops! No result. Try again with new alpha parameters")
    print("")
    print("EPOCH " + str(min_epok - 1) + " WITH MIN ERROR " + min_err_txt
          + " starting alpha " + alpha_max_txt + " ending alpha " + alpha_txt
          + " Iterations " + str(iterations)
          + " Total Epochs " + str(ne - 1))
    print("")
    for label, value in output_files:
        print(label + value)
    print('CV before Catalog ' + str_med_cv_gen)
    print('CV after Catalog ' + cv_media_gen_after)
    print('Knowledge Index ' + know_index)
    print("")

    # Elapsed time
    elapsed_time = datetime.datetime.now() - t0
    print("Elapsed time (seconds) : " + str(elapsed_time.seconds))
    print("")


if __name__ == "__main__":
    main()