# -*- coding: iso-8859-1 -*-
"""Generate a text with words looking like those in a given text,
based on the frequency of character sequences
"""
import string
import io
import random
class TextGenerator:
    """Generate text whose character sequences mimic those of a source text.

    For every run of ``seq_len`` consecutive characters recorded from the
    source text, the generator remembers which characters followed it; new
    text is produced by repeatedly drawing one of those followers at random.
    """

    def __init__(self, txt, seq_len=5):
        """Build the follower table and the list of start sequences.

        txt = original text
        seq_len = sequence length ; 3 to 6 give the best results
        """
        # dictionary mapping sequences of seq_len characters to the list
        # of characters following them in the original text; duplicates are
        # kept so that a random pick follows the original frequencies
        self.followers = {}
        # NOTE(review): the bound stops 2*seq_len before the end of txt, so
        # the last seq_len sequences that do have a follower are never
        # recorded; kept as-is so the learned model is unchanged.
        for i in range(len(txt) - 2 * seq_len):
            sequence = txt[i:i + seq_len]  # sequence of seq_len characters
            next_char = txt[i + seq_len]   # the character following it
            self.followers.setdefault(sequence, []).append(next_char)

        # sequences that start with an uppercase letter
        starts = [key for key in self.followers
                  if key[0] in string.ascii_uppercase]
        if not starts:  # just in case...
            starts = list(self.followers)

        # build a distribution of these sequences with the same frequency
        # as in the original text: each start appears once per recorded
        # occurrence
        self.starts = []
        for key in starts:
            self.starts.extend([key] * len(self.followers[key]))

    def random_text(self, length=5000):
        """Return a randomly generated text.

        length = number of characters generated after the initial start
        sequence; the returned string is therefore seq_len + length
        characters long (only the start sequence when length <= 0).

        Raises IndexError (from random.choice) when the source text was too
        short for any sequence to be recorded.
        """
        # pick a start at random and initialize
        # generated text with this sequence
        sequence = random.choice(self.starts)
        gen_text = io.StringIO()
        gen_text.write(sequence)

        remaining = length
        while remaining > 0:
            candidates = self.followers.get(sequence)
            if not candidates:
                # Dead end: the current sequence was never recorded as a key
                # (it only occurs in the unrecorded tail of the source text).
                # Restart from a fresh start sequence instead of raising
                # KeyError; its characters count against the budget so the
                # total output length is unchanged. Every start is a key of
                # self.followers, so the next iteration always has followers.
                sequence = random.choice(self.starts)
                restart = sequence[:remaining]
                gen_text.write(restart)
                remaining -= len(restart)
                continue
            # pick a character among those following current sequence
            next_char = random.choice(candidates)
            gen_text.write(next_char)
            sequence = sequence[1:] + next_char
            remaining -= 1
        return gen_text.getvalue()
if __name__ == "__main__":
    import re

    # Read the source text; the context manager closes the file even if
    # reading fails.
    with open('hamlet.txt') as source:
        txt = source.read()
    # collapse runs of consecutive newlines into a single newline
    txt = re.sub("\n+", '\n', txt)
    gen = TextGenerator(txt)
    res = gen.random_text(3000)
    # Write the generated text; the file is closed even if the write fails.
    with open('result.txt', 'w') as out:
        out.write(res)
Diff to Previous Revision
--- revision 3 2011-12-18 17:56:03
+++ revision 4 2011-12-25 11:14:39
@@ -12,26 +12,22 @@
def __init__(self,txt,seq_len=5):
"""txt = original text
seq_len = sequence length ; 3 to 6 give the best results"""
- self.seq_len = seq_len
- # dictionary mapping a n-char sequence to the list of characters
- # following this sequence in the original text
+ # dictionary mapping sequences of seq_len chararcters to the list
+ # of characters following them in the original text
self.followers = {}
for i in range(len(txt)-2*seq_len):
- sequence = txt[i:i+seq_len] # n-character sequence
+ sequence = txt[i:i+seq_len] # sequence of seq_len characters
next_char = txt[i+seq_len] # the character following this sequence
if sequence in self.followers:
self.followers[sequence].append(next_char)
else:
self.followers[sequence]=[next_char]
- sequences = list(self.followers.keys())
- sequences.sort()
-
# sequences that start with an uppercase letter
- starts = [ key for key in sequences
+ starts = [ key for key in self.followers
if key[0] in string.ascii_uppercase ]
if not starts: # just in case...
- starts = sequences
+ starts = list(self.followers.keys())
# build a distribution of these sequences with the same frequency
# as in the original text
@@ -64,4 +60,3 @@
out = open('result.txt','w')
out.write(res)
out.close()
-