"""Rewrite one text so that it follows the line structure of another.

Usage: <script> SOURCE_FILE MODEL_FILE

For every non-blank line of MODEL_FILE, take its first and last word, find
the first SOURCE_FILE line containing each of them, stitch the tail of the
first match to the head of the second, and trim words out of the middle
until the result has as many words as the model line.  Blank model lines
are echoed as blank lines; model lines with no match produce no output.

The code sticks to constructs that behave the same on Python 2 and 3
(single-argument ``print(...)`` calls, ``//`` for integer division).
"""

import sys

# Set to true if you want extra output for debugging.
# TODO: turn this into a command line parameter.
verbose = 0

# Set to true if you want to show the original line above the munged one.
# TODO: turn this into a command line parameter.
print_original = 0


def anonymize(line):
    """Remove a leading "USERNAME:" style prefix from *line*.

    Everything up to and including the first colon is dropped and the
    remainder is stripped of surrounding whitespace, e.g.
    "OBRIGADO: hello" becomes "hello".  A line without a colon is returned
    unchanged (and unstripped).
    """
    _, colon, remainder = line.partition(':')
    if colon:
        return remainder.strip()
    return line


def remove_breaks(line):
    """Return *line* with every newline and carriage return removed."""
    return line.replace('\n', '').replace('\r', '')


def word_at(string, list):
    """Return the index of the first element of *list* containing *string*.

    Less strict than ``list.index(string)`` because partial (substring)
    matches count.  Returns -1 when no element contains *string*.

    The second parameter keeps its original name for backward
    compatibility even though it shadows the builtin inside this function.
    """
    for index, item in enumerate(list):
        if string in item:
            return index
    return -1


def _first_line_containing(word, lines):
    """Return the first entry of *lines* that contains *word*, or None."""
    for candidate in lines:
        if word in candidate:
            return candidate
    return None


def munge_line(line, source_lines):
    """Build a replacement for one model line out of the source text.

    line         -- a stripped, non-empty line of the model text.
    source_lines -- source lines that have already been passed through
                    remove_breaks() and anonymize().

    Returns the munged line as a string, or None when the first or the
    last word of *line* does not occur anywhere in *source_lines*.
    """
    line_list = line.split(' ')
    first_word = line_list[0]
    last_word = line_list[-1]

    first_word_line = _first_line_containing(first_word, source_lines)
    if first_word_line is None:
        return None
    last_word_line = _first_line_containing(last_word, source_lines)
    if last_word_line is None:
        return None

    # The words contain no spaces, so a substring hit in the line always
    # falls inside a single space-separated token: word_at() cannot return
    # -1 for either lookup below.

    # For the first line, save from the word forward.
    first_line_list = first_word_line.split(' ')
    first_line_list = first_line_list[word_at(first_word, first_line_list):]

    # For the last line, save from the word backward.
    last_line_list = last_word_line.split(' ')
    last_line_list = last_line_list[:word_at(last_word, last_line_list) + 1]

    # TODO: remove blank entries produced by runs of spaces.
    complete_line_list = first_line_list + last_line_list
    if verbose:
        print(complete_line_list)

    # Get as close to the model's word count as possible by discarding
    # words from the middle, which keeps the matched first and last words
    # at the edges.
    model_line_length = len(line_list)
    while len(complete_line_list) > model_line_length:
        complete_line_list.pop(len(complete_line_list) // 2)

    return ' '.join(complete_line_list)


def main(argv):
    """Run the munger for ``argv = [program, source_file, model_file]``.

    Prints the result to stdout.  Returns 0 on success, 2 when arguments
    are missing and 1 when a file cannot be read.
    """
    if len(argv) < 3:
        program = argv[0] if argv else 'munge'
        sys.stderr.write('usage: ' + program + ' SOURCE_FILE MODEL_FILE\n')
        return 2

    if verbose:
        print(argv)
        print('Take the text from ' + argv[1] + ' and model it after ' + argv[2])

    source_file_name = argv[1]
    model_file_name = argv[2]

    # Read each line of each file into a list; "with" closes the handles
    # even if reading fails.
    try:
        with open(source_file_name, 'r') as source_file:
            source_lines = source_file.readlines()
        with open(model_file_name, 'r') as model_file:
            model_lines = model_file.readlines()
    except IOError as err:
        sys.stderr.write(str(err) + '\n')
        return 1

    # Clean the source once, up front.  Matching against the cleaned text
    # means a word that only occurs inside a stripped "NAME:" prefix can no
    # longer be selected as a match.
    clean_source_lines = [anonymize(remove_breaks(raw)) for raw in source_lines]

    # Go through the model and look for matches to the first and last words.
    for index, raw_line in enumerate(model_lines):
        line = raw_line.strip()

        # Keep the model's blank lines as blank lines in the output.
        if not line:
            print('')
            continue

        if verbose:
            first_word = line.split(' ')[0]
            last_word = line.split(' ')[-1]
            print('------------------------------------')
            print('Line ' + str(index) + ' starts with "' + first_word + '" ends with "' + last_word + '"')

        complete_line = munge_line(line, clean_source_lines)
        if complete_line is None:
            continue

        # Print the original above the munged line.
        if print_original:
            print(line)

        print(complete_line)

        # Add a line break after each pair for readability.
        if print_original:
            print('')

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))