Commit 555e7239 by manetta

adding text-clean-up script

parent ef9f51d4
from nltk.tokenize import word_tokenize
import re
import string
# Text clean-up script (work in progress): reads the source text, echoes it
# for inspection, and writes every collected sentence -- stripped, capitalised
# and followed by a single space -- to a UTF-8 encoded output file.
filename = 'input/mankind-in-the-making.txt'
sentences = []

with open(filename, 'r') as source:
    lines = source.readlines()

# Debug output. A parenthesised single argument prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(lines)
for line in lines:
    print(line)
    # NOTE(review): the tokenising step below is still disabled, so
    # `sentences` stays empty and the output file is written without content.
    # `word` is never assigned in this draft -- presumably it was meant to
    # iterate over `wordsline`; confirm before enabling.
    # line = line.decode('utf8')
    # wordsline = word_tokenize(line)
    # word = word.encode('utf8')
    # sentences.append(word)

# Disabled draft of a punctuation-stripping pass.
# NOTE(review): `tokenized_reports` is not defined in the visible code.
# regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
# tokenized_reports_no_punctuation = []
# for review in tokenized_reports:
#     new_review = []
#     for token in review:
#         new_token = regex.sub(u'', token)
#         if not new_token == u'':
#             new_review.append(new_token)
#     tokenized_reports_no_punctuation.append(new_review)
# tokenized_reports_no_punctuation

import codecs

# codecs.open encodes the written text as UTF-8 on the way out.
with codecs.open("input/mankind-in-the-making_stripped.txt", "w", "utf-8") as destination:
    for sentence in sentences:
        destination.write(sentence.strip().capitalize() + " ")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment