#!/usr/bin/python
# -*- encoding:utf-8 -*-
"""Translate Google's Transcript into srt file.

Takes google's transcript filename as argument (xml extension required).

NB: to get google's transcript, use this URL:
http://video.google.com/timedtext?lang=en&v=VIDEO_ID

srt example::

    1
    00:00:20,672 --> 00:00:24,972
    Entre l’Australia et la South America, dans l’Océan South Pacific…

Google's transcript example (first tags)::

    <?xml version="1.0" encoding="utf-8" ?>
    <transcript>
    <text start="11.927" dur="2.483">This is a matter of National Security.</text>
"""

import html
import os
import re
import sys

# Pattern to identify a subtitle and grab start, duration and text.
# The leading '<' and trailing '>' are optional because main() splits the
# document on '><', which strips them from every inner tag.  The fractional
# part of the timestamps is optional so that values such as start="5" match.
pat = re.compile(
    r'<?text start="(\d+(?:\.\d+)?)" dur="(\d+(?:\.\d+)?)">(.*)</text>?')


def parseLine(text):
    """Parse a subtitle.

    Args:
        text: One chunk of the transcript, expected to hold a single
            ``text`` element (with or without its outer angle brackets).

    Returns:
        A ``(start, duration, content)`` tuple of strings when the chunk is
        a subtitle element, otherwise ``None``.
    """
    found = pat.match(text)
    if found is None:
        return None
    return found.groups()


def formatSrtTime(secTime):
    """Convert a time in seconds (google's transcript) to srt time format.

    Args:
        secTime: Number of seconds, as a float or as a numeric string.

    Returns:
        The time formatted as ``HH:MM:SS,mmm``.
    """
    # Work in whole milliseconds: srt needs exactly three digits after the
    # comma, and rounding absorbs float noise such as 14.409999999999998.
    total_ms = int(round(float(secTime) * 1000))
    total_sec, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(total_sec, 60)
    hours, minutes = divmod(minutes, 60)
    return "{:02}:{:02}:{:02},{:03}".format(hours, minutes, seconds, millis)


def convertHtml(text):
    """Decode HTML/XML character references found in a subtitle.

    Handles ``&#39;`` and ``&quot;`` as well as every other named or numeric
    reference (``&amp;``, ``&lt;``, ...).

    Args:
        text: Subtitle content as found in the transcript.

    Returns:
        The content with character references replaced by the characters
        they stand for.
    """
    return html.unescape(text)


def printSrtLine(i, elms):
    """Build a subtitle in srt format.

    Despite its name this function prints nothing; it returns the text.

    Args:
        i: Sequence number of the subtitle.
        elms: ``(start, duration, content)`` as returned by parseLine().

    Returns:
        The srt entry, terminated by the blank line that separates entries.
    """
    start, duration, content = elms[0], elms[1], elms[2]
    end = float(start) + float(duration)
    return "{}\n{} --> {}\n{}\n\n".format(
        i, formatSrtTime(start), formatSrtTime(end), convertHtml(content))


# Kept at module level for backward compatibility; None when the module is
# imported (or run) without a command-line argument.
fileName = sys.argv[1] if len(sys.argv) > 1 else None


def main(fileName):
    """Parse google's transcript and write the converted data in srt format.

    The srt file is written next to the transcript, with the transcript's
    extension replaced by ``.srt``.

    Args:
        fileName: Path of the transcript file to convert.
    """
    with open(fileName, 'r', encoding='utf-8') as infile:
        # Drop the line breaks so that every tag ends up on a single line.
        content = infile.read().replace('\n', '')

    # Split the buffer to get one string per tag.
    chunks = content.split('><')

    # Swap only the final extension: str.replace() would touch every '.xml'
    # in the path and leave a name without '.xml' unchanged.
    srtfileName = os.path.splitext(fileName)[0] + '.srt'

    subtitles = (parseLine(chunk) for chunk in chunks)
    with open(srtfileName, 'w', encoding='utf-8') as outfile:
        numbered = enumerate((s for s in subtitles if s is not None), 1)
        for index, subtitle in numbered:
            outfile.write(printSrtLine(index, subtitle))
    print('DONE ({})'.format(srtfileName))


if __name__ == "__main__":
    if fileName is None:
        sys.exit('usage: {} TRANSCRIPT.xml'.format(sys.argv[0]))
    main(fileName)