Commit 80b9da0d by manetta

pushing a working text-cleaner

parent 555e7239
Showing with 22 additions and 28 deletions
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import re
import string
import codecs
filename = 'input/mankind-in-the-making.txt'
sentences = []
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
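# for illustration only (not part of the original script): applied to the byte
# string 'Hello, world!' this pattern strips the comma and the exclamation mark,
# so regex.sub('', 'Hello, world!') returns 'Hello world'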
with open(filename, 'r') as source:
    lines = source.readlines()[0]  # read once; the script only uses the first line of the file
    # print lines
    lines = lines.split('. ')  # rough sentence split on '. '
    for line in lines:
        print line
        # earlier approach, kept for reference:
        # line = line.decode('utf8')
        # wordsline = word_tokenize(line)
        # word = word.encode('utf8')
        # sentences.append(word)
        # regex = re.compile('[%s]' % re.escape(string.punctuation))
        # tokenized_reports_no_punctuation = []
        # for review in tokenized_reports:
        #     new_review = []
        #     for token in review:
        #         new_token = regex.sub(u'', token)
        #         if not new_token == u'':
        #             new_review.append(new_token)
        #     tokenized_reports_no_punctuation.append(new_review)
        # tokenized_reports_no_punctuation
        # print line
        if " ”" in line:
            line = line.replace(" ”", "”")  # assign the result; str.replace does not modify in place
        line = line.decode('utf8')
        words = word_tokenize(line)
        print words
        stripped = []  # renamed from 'string' to avoid shadowing the string module
        for word in words:
            new_word = regex.sub(u'', word)  # strip punctuation characters from the token
            if not new_word == u'':
                stripped.append(new_word)
        stripped = ' '.join(stripped)
        sentences.append(stripped)
with codecs.open("input/mankind-in-the-making_stripped.txt", "w", "utf-8") as destination:
    for sentence in sentences:
......
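As a rough illustration of the cleaning step this commit adds, the tokenize-then-strip-punctuation pattern works as in the minimal sketch below; the sample sentence is invented for the example and is not taken from the input text, and NLTK's 'punkt' tokenizer data is assumed to be installed.

# -*- coding: utf-8 -*-
# Minimal sketch of the tokenize-and-strip step (illustrative, not the committed script).
import re
import string
from nltk.tokenize import word_tokenize

regex = re.compile('[%s]' % re.escape(string.punctuation))

sentence = u'Man, like other animals, is moulded by circumstances.'  # invented sample
words = word_tokenize(sentence)               # [u'Man', u',', u'like', ...]
cleaned = [regex.sub(u'', w) for w in words]  # remove punctuation characters from each token
cleaned = [w for w in cleaned if w != u'']    # drop tokens that were pure punctuation
print ' '.join(cleaned)                       # Man like other animals is moulded by circumstances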