#!/usr/bin/env python
# -*- coding: cp1252 -*-
# Figure out where the line breaks are in the file and do something with them.
import os
import sys
import getopt
import time
import string

# Module-level state shared by the functions in this file.  The original
# module-level "global" statements were no-ops and have been dropped.
g_title = ""
# Earlier assignments of "utf-8" and "cp1252" were overwritten immediately;
# only this final value ever took effect.
g_encoding = "iso-8859-1"
g_index_only = False
g_date_folder = ""


# SVT Nyheter
def go_past_ul(s):
    """Return the text following the first nested pair of <ul> lists.

    Scans for "ul>" (matching both opening and closing tags; a preceding "/"
    marks a closing tag) and tracks the nesting depth.  Once a depth of two
    has been reached and the depth drops back to zero, everything after that
    closing tag is returned.  If that never happens, ``s`` is returned
    unchanged.

    The scan starts at index 3, exactly as before, so a tag at the very
    start of ``s`` is not counted.
    """
    s_result = s
    num_uls = 0
    got_two = False
    ulpos = 1
    # The number of iterations is capped by the length of the input.
    for _ in range(len(s)):
        ulpos = s.find("ul>", ulpos + 2)
        # Check for "not found" BEFORE indexing: the old order evaluated
        # s[-2], which raised IndexError on one-character strings and
        # counted a tag that was not there.
        if ulpos < 0:
            break
        if s[ulpos - 1] == "/":
            num_uls -= 1
        else:
            num_uls += 1
        if num_uls >= 2:
            got_two = True
        if got_two and num_uls == 0:
            s_result = s[ulpos + 3:]
            break
    return s_result


def doc_title_name():
    """Return the index title built from today's date.

    The month is the lower-cased abbreviation from month_name().  All parts
    are formatted from one clock reading so they cannot disagree when the
    call straddles midnight.
    """
    now = time.localtime()
    s_month = month_name(now.tm_mon).lower()
    return ("Nyheter / Actualités - " + time.strftime("%d", now) + " "
            + s_month + " " + time.strftime("%Y", now))


def date_folder_name():
    """Return the dated folder path under the current working directory.

    The folder name is "NY" + day + month abbreviation + two-digit year,
    formatted from one clock reading.
    """
    now = time.localtime()
    return (os.getcwd() + os.sep + time.strftime("NY%d", now)
            + month_name(now.tm_mon) + time.strftime("%y", now))


# Month abbreviations exactly as the original if-chain produced them.
_MONTH_ABBREVIATIONS = {
    1: "JAN", 2: "FEV", 3: "MAR", 4: "AVR", 5: "MAI", 6: "JUN",
    7: "JUL", 8: "AOU", 9: "SEP", 10: "OKT", 11: "NOV", 12: "DEC",
}


def month_name(n_month):
    """Return the upper-case abbreviation for month number 1-12, else ""."""
    return _MONTH_ABBREVIATIONS.get(n_month, "")


l_tags = [""]
k_style_sheet = "_000_style.css"


def add_tag_to_globals(s_tag, l_tags):
    """Append ``s_tag`` to the list ``l_tags`` unless it is already present."""
    if s_tag not in l_tags:
        l_tags.append(s_tag)
    return


# NOTE(review): the rest of this original line is extraction-damaged: text
# between '<' and '>' was stripped, fusing write_all_tags() with the tail of
# another function.  It also opens process_file(), whose body continues on the
# following lines.  It cannot be restored without guessing, so it is preserved
# verbatim below for manual recovery.
# def write_all_tags(): fw=open("all_tags.txt", "w") i=0 while i=n or i>=len(s) or s[i]==".": b=0 if b!=0: r=r+s[i] i=i+1 return r def process_file(szFilename,iFileNum): global g_title szOutfile="~out.htm" f = open(szFilename, "r") szLine="" s_last_str="" ls="" s="" b_read_file = 1 ft = 0;lt = 0 r="" while b_read_file == 1: 
szLine=f.readline() szLine=szLine.replace("\n"," ") s=s+szLine ft = f.tell() if lt == ft: b_read_file=0 lt = ft f.close() szOutfile=simplify_filename(szFilename) g_title=get_html_title(s) g_title=replace_accented_chars_with_codes(g_title) if len(g_title)<=4: g_title=szOutfile#"no_title" if s.find("")>=0 or s.find("")>=0 or s.find("=0: return szFilename if g_index_only==True: if os.path.exists(szOutfile)==False: os.rename(szFilename,szOutfile) return szOutfile while s<>s_last_str: s_last_str=s s=remove_ctrl_chars(s) s=s.replace("") s=remove_acirc(s) s=s.replace("","\n") s=s.replace("
","
\n") s=s.replace("
", "
\n") # this puts both header lines for canadian articles at the top. htp=0 stp="" if s.endswith(""): htp=s.rfind("

") if htp>=0: stp=s[htp:] s=s.replace(stp,"") s=stp+s s=s.replace("

","
") s=s.replace("h1>","h2>") s=meta_encoding_string()+s+"\n\n" os.unlink(szFilename) szOutfile=simplify_filename(szFilename) if len(szOutfile)<=8: szOutfile="no_"+szOutfile szOutfile=next_avail_file(szOutfile) # stops it from overwriting files in case they have the same name after simplifying with chars. fw=open(szOutfile, "w") fw.write(s) fw.close() return szOutfile def remove_ctrl_chars(s): r="" ls=len(s) i=0 c="" while i=0: i=i-1 if s[i]==".": j=i break if j>=0: r=s.__getslice__(0,j) return r def remove_end_nums(s): r=s i=len(s) j=-1 while i>=0: i=i-1 if (s[i]>="a" and s[i]<="z"): j=i+1 break if j>=0: r=s.__getslice__(0,j) return r def get_ext(s): r="" i=len(s) j=-1 while i>=0: i=i-1 if s[i]==".": j=i break if j>=0: r=s.__getslice__(j+1,len(s)) return r def next_avail_file(s): r=s i_num=0 while os.path.exists(s)==True: #s_ext=get_ext(s) s_ext="htm" r=remove_ext(s) r=remove_end_nums(r) r=r + "_" + str(i_num) + "." + s_ext i_num=i_num+1 s=r return r def simplify_filename(s): k_maxlen=40 r="" i=0 c="" s_ext="" s_file="" i_len=0 s=deaccent_string(s) for i in range(0,len(s)): c=s[i] c=c.lower() if ((c>="a" and c<="z") or (c>="0" and c<="9")) or c==".": r=r+c else: r=r+"_" while r.find('__')>=0:r=r.replace("__","_") r=replace_accented_chars_with_codes(r) s_file=remove_ext(r) #s_ext=get_ext(r) #s_ext=s_ext.__getslice__(0,3) s_ext="htm" i_len=len(s_file) if i_len>k_maxlen:i_len=k_maxlen s_file=s_file.__getslice__(0,i_len) r=s_file + "." + s_ext return r def deaccent_string(s): s=s.replace("ä","a") s=s.replace("å","a") s=s.replace("à","a") s=s.replace("ö","o") s=s.replace("é","e") s=s.replace("ê","e") s=s.replace("è","e") s=s.replace("ë","e") s=s.replace("î","i") s=s.replace("ù","u") s=s.replace("û","u") s=s.replace("Ä","a") s=s.replace("Å","a") s=s.replace("Ö","o") return s def get_rid_of_useless_tags(s): s=s.replace("","") s=s.replace("
","") s=s.replace("
","") s=s.replace("
", "

") s=s.replace("
","
") s=s.replace("
","
") s=s.replace("

","
") s=s.replace("
","
") s=s.replace("

","
") s=s.replace(" "," ") s=s.replace("
","") s=s.replace("","") s=s.replace("","") s=s.replace("
","
") s=s.replace("
","
") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("
","
") s=s.replace("
","
") s=s.replace("
","
") s=s.replace("
","
") s=s.replace("
","

") s=s.replace("

","
") s=s.replace("

","
") s=s.replace("","") s=s.replace("
","") s=s.replace("
","") s=s.replace("
","") s=s.replace("

  • ","
  • ") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("
    ","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("","") s=s.replace("","") s=s.replace("
    ","") s=s.replace("
    ","") s=s.replace("&quot;","\"") s=s.replace("\xEF\xBB\xBF","") #gets rid of weird symbols at the beginning of files. return s def replace_accented_chars_with_codes(s): # russian s=s.replace("\xD0\x27","B")# D0 =-D s=s.replace("\xD0\xB5","e")# s=s.replace("\xD1\x80","p")# D1 =N~ s=s.replace("\xD0\xBD","H") # normal s=s.replace("\xE2\x89\xA5","≥") s=s.replace("\xE2\x89\xA4","≤") s=s.replace("\xC5\x91","ő") s=s.replace("\xC5\xB1","ű") s=s.replace("\xE2\xAC\xA8","—") s=s.replace("\xEF\xBF\xBD"," ") s=s.replace("\xE2\x99\xA5","♥") s=s.replace("\xE2\x82\xAC","€") s=s.replace("\xE2\x80\x89"," ") s=s.replace("\xE2\x80\xBA",">") s=s.replace("\xE2\x80\xA6","...") s=s.replace("\xE2\x80\xA2","•") s=s.replace("\xE2\x80\x93","–") s=s.replace("\xE2\x80\x94","—") s=s.replace("\xE2\x80\x98","'") s=s.replace("\xE2\x80\x99","'") s=s.replace("\xE2\x80\x9C","\"") s=s.replace("\xE2\x80\x9D","\"") s=s.replace("\xE2\x80","—") # need Z with caron s=s.replace("\xE2\x98","☼") s=s.replace("\xC2\xB0","°") s=s.replace("\xC2\xA0"," ") s=s.replace("\xC2\xAD","­") s=s.replace("\xC2\xA4","•") s=s.replace("\xE2\x96\xA0","•") s=s.replace("\xC2\xAE","®") s=s.replace("\xC2\xBB","»") s=s.replace("\xC2\xAB","«") s=s.replace("\xC2\xB2","²") s=s.replace("\xC2\xB3","³") s=s.replace("\xC2\xB9","¹") s=s.replace("\xC2\xB4","'") s=s.replace("\xC2\xBD","½") s=s.replace("\xC3\xBC","ü") s=s.replace("\xC3\xBA","ú") s=s.replace("\xC3\xA0","à") s=s.replace("\xC3\xA1","á") s=s.replace("\xC3\xA2","â") s=s.replace("\xC3\xA3","ã") s=s.replace("\xC3\xA4","ä") s=s.replace("\xC3\xA5","å") s=s.replace("\xC3\xA6","æ") s=s.replace("\xC3\xA7","ç") s=s.replace("\xC3\xAC","ì") s=s.replace("\xC3\xAD","í") s=s.replace("\xC3\xAE","î") s=s.replace("\xC3\xAF","ï") s=s.replace("\xC3\xB0","ð") s=s.replace("\xC3\xB1","ñ") s=s.replace("\xC3\xB2","ò") s=s.replace("\xC3\xB3","ó") s=s.replace("\xC3\xB5","õ") s=s.replace("\xC3\xB4","ô") s=s.replace("\xC3\xB8","ø") s=s.replace("\xC3\xB6","ö") s=s.replace("\xC3\xB9","ù") 
s=s.replace("\xC3\xBE","þ") s=s.replace("\xC3\xA9","é") s=s.replace("\xC3\xA8","è") s=s.replace("\xC3\xAA","ê") s=s.replace("\xC3\xAB","ë") s=s.replace("\xC3\xBB","û") s=s.replace("\xC3\xB1","ñ") s=s.replace("\x00\x9C","œ") s=s.replace("\xC2\x9C","œ") s=s.replace("\xC5\x22","œ") s=s.replace("\xC5\x27","Œ") s=s.replace("\xC5\x93","œ") s=s.replace("\xC2\xA9","©") s=s.replace("\xC2\xA3","£") s=s.replace("\xC3\x87","Ç") s=s.replace("\xC3\x84","Ä") s=s.replace("\xC3\x85","Å") s=s.replace("\xC3\x82","Â") s=s.replace("\xC3\x80","À") s=s.replace("\xC3\x81","Á") s=s.replace("\xC3\x88","È") s=s.replace("\xC3\x89","É") s=s.replace("\xC3\x8A","Ê") s=s.replace("\xC3\x8B","Ë") s=s.replace("\xC3\x8C","Ì") s=s.replace("\xC3\x8D","Í") s=s.replace("\xC3\x8E","Î") s=s.replace("\xC3\x8F","Ï") s=s.replace("\xC3\x94","Ô") s=s.replace("\xC3\x96","Ö") s=s.replace("\xC3\x99","Ù") #? s=s.replace("\xC3\x9C","Ü") #? #s=s.replace("\xBF","") s=s.replace("\xC0","À") s=s.replace("\xC7","Ç") s=s.replace("\xC8","È") s=s.replace("\xC9","É") s=s.replace("\xD4","Ô") s=s.replace("\xD9","Ù") s=s.replace("\xDB","Û") s=s.replace("\xE0","à") s=s.replace("\xE2","â") s=s.replace("\xE7","ç") s=s.replace("\xE8","è") s=s.replace("\xE9","é") s=s.replace("\xEA","ê") s=s.replace("\xEB","ë") s=s.replace("\xEE","î") s=s.replace("\xEF","ï") s=s.replace("\xF4","ô") s=s.replace("\xF9","ù") s=s.replace("\xFB","û") s=s.replace("\n"," ") s=s.replace("\r"," ") s=s.replace("\t"," ") s=s.replace("  "," ") s=s.replace("\x80","€") s=s.replace("\x85","...") s=s.replace("\x2D","-") s=s.replace("\u002D","-") s=s.replace("\x92","'") s=s.replace("\x93","\"") s=s.replace("\x94","\"") s=s.replace("\xBB","»") s=s.replace("\xAB","«") s=s.replace("\xA0"," ") s=s.replace("’","\'") s=s.replace("‘","\'") s=s.replace("”","\"") s=s.replace("“","\"") s=s.replace(" \xC3 "," à ") s=s.replace("'\xC3 ","'à ") return s def strip_script(s): r="" cp=0 lbp=1 rbp=0 while lbp>=0: lbp=s.find("=0: while cp",lbp+10) # was lbp+1 if rbp>lbp: cp=rbp+11 else: 
cp=cp+10 while cp\n" r+="\n" r+="\n" r+="\n" r+="\n" r+="\n" return r def char_code_table(): f = open("charmap.htm", "w") i=32 s="" f.write(meta_encoding_string()) f.write("
  • ") while i<=255: s="" f.write(s) i=i+1 f.write("
    " + hex(i).upper() + "" + "&#" + str(i) + ";
    ") return def get_html_title(s): tp=0 te=0 r="" tp=s.find("") te=s.find("",tp) if tp>0 and te>0 and te>tp: if te-tp<200: r=s[tp+7:te] return r def strip_tags(s): r="" c="" s_tag="" s_last_tag="" tag_char_count=0 cp=0 b_intag=False b_afterspace=False while cp2 and c=="/":c="" if b_afterspace==True and c!=">":c="" s_tag=s_tag+c if c==">" and b_intag==True: b_intag=False if s_tag==inverse_tag(s_last_tag): gt_index=r.rfind("<") if gt_index>=0: r=r[0:gt_index] else: add_tag_to_globals(s_tag, l_tags) r=r+s_tag s_last_tag=s_tag s_tag="" c="" if b_intag==True and b_afterspace==True:c="" if b_intag==False: if c!=" " and c!="":s_last_tag="" r=r+c cp=cp+1 return r def inverse_tag(s): s_result="" if len(s)>1: if s[1]=="/": s_result=s_result.replace("/","") else: s_result="=0: ap=s.find("",ap+3) if ap>0 and rp>ap: s=s[:ap]+s[rp+3:] else: break return s def strip_copyrights(s): r="" ap=0 rp=0 ap=s.find("©") while ap>=0: ap=s.find("©") rp=s.find("<",ap) if ap>0 and rp>ap: s=s[:ap]+s[rp:] else: break return s # gets rid of annoying "Â"s. def remove_acirc(s): r="" ls=len(s) i=0 c="" lc="" b=False bp=False while i="a" and s<="z"): b=True if (s>="A" and s<="Z"): b=True return b # loop through all of the files in the directory def main(): g_date_folder=date_folder_name() num_html_files=0 k_index_file="_000_index.htm" ls=os.listdir(".") ls.sort() r="" s_new_filename="" s_link_name="" i=0 while ik_index_file: num_html_files=num_html_files+1 i=i+1 s_progress="0" i=0 nth_html_file=0 while ik_index_file: nth_html_file=nth_html_file+1 print s_progress + " " + szFilename[:30] s_new_filename="" if os.path.exists(szFilename)==True: s_new_filename=process_file(szFilename,i) if s_new_filename!="": s_link_name=s_new_filename if g_title!="":s_link_name=g_title s_link="" + s_link_name + "" if s_link.find("")>=0:s_link=s_link+"" if s_link.find("")>=0:s_link=s_link+"" s_link=s_link+"
    \n" r=r+s_link #show the progress indicator. if num_html_files>0: s_progress="%.2f" % (float(nth_html_file)/float(num_html_files)*float(100)) else: s_progress="0" curr_time=os.times()[0] last_time=os.times()[0] while curr_time==last_time: last_time=os.times()[0] # progress was here. i=i+1 if nth_html_file>0: if not os.path.exists(g_date_folder): os.mkdir(g_date_folder) r=meta_encoding_string()+""+""+doc_title_name()+""+"
    "+r fw=open(k_index_file, "w") fw.write(r) fw.write("
    ") fw.close() fw=open(g_date_folder + os.sep + k_style_sheet, "w") #fw.write(".specialbar{background-color:maroon;color:white;font-weight:bold;text-align:center;}\n") fw.write("html{font-size:14pt;width:75%;margin:auto;padding:6pt;border:solid black 1px;font-family:Sans}\n") fw.write("td{vertical-align:top;}\n") fw.write("h1{color:navy;font-size:18pt;}\n") fw.write("h2{color:navy;font-size:18pt;}\n") fw.write("a{color:navy;font-size:14pt;}\n") fw.close() else: print "Nothing to do" if True: i=0 ls=os.listdir(".") ls.sort() while i