Commit 898d330f authored by ana

adding word2vec & wikidata project

parent 1b25e996
This diff is collapsed.
This diff is collapsed.
from turtle import *
# Draw a filled star: red outline, yellow interior.
color('red', 'yellow')
begin_fill()

# Keep drawing 200-unit edges, turning left 170 degrees after each one,
# until the turtle is back within one unit of the origin.
returned_home = False
while not returned_home:
    forward(200)
    left(170)
    # abs() of the position vector is the turtle's distance from (0, 0).
    returned_home = abs(pos()) < 1

end_fill()
# Hand control to the Tk event loop so the window stays open.
done()
\ No newline at end of file
from turtle import *
import random
# Draw a random walk driven by the tree markers found in a text dump, then
# export the canvas as PostScript.
#
# NOTE(review): the indentation of this script was lost; the grouping of
# the statements under the `'?'` test below is assumed -- confirm against
# the original file.
color('green', 'yellow')
begin_fill()

# The dump contains box-drawing characters, so decode it explicitly instead
# of relying on the platform's default encoding.
# Alternative input used by the author: "Cocos+Q13099531+Russian.txt".
with open("Cocos+Q13099531+Kazakh.txt", 'r', encoding='utf-8') as source:
    for line in source:
        if '├──' in line:
            forward(20)
            right(random.randint(10, 90))
        # The nested marker must be tested before the bare '└──' marker:
        # every line containing '│ └──' also contains '└──', so the
        # original order made this branch unreachable.
        elif '│ └──' in line:
            forward(5)
            left(85)
        elif '└──' in line:
            forward(20)
            right(random.randint(10, 90))

        if '?' in line:
            # NOTE(review): the pen is lifted here and never put down again,
            # so no lines are traced after the first '?' line -- confirm
            # that this is intended.
            up()
            right(random.randint(10, 90))
            circle(20)

end_fill()
up()

# Save the drawing; the author alternated between "fully.eps" and
# "questions.eps" as the output name.
ts = getscreen()
ts.getcanvas().postscript(file="questions.eps")
done()
This diff is collapsed.
This diff is collapsed.
<tf.Variable 'Variable:0' shape=(5000, 128) dtype=float32_ref>
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
[[ 0.05207845 -0.05965923 0.10491826 ..., -0.02162105 -0.09924855
0.11219258]
[-0.02037292 -0.12645152 0.02942111 ..., 0.04379232 -0.16621237
0.09704864]
[-0.03017078 -0.23357774 -0.03666852 ..., -0.01781653 -0.05754636
0.1591932 ]
...,
[ 0.09309074 0.10831825 0.01068838 ..., 0.01552744 -0.18842347
0.18424799]
[-0.02866201 0.06726439 -0.09029277 ..., 0.03911865 -0.19248359
-0.04082475]
[ 0.091321 -0.08085828 0.13583764 ..., -0.15024045 -0.12273747
0.12634936]]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import re
import string
import codecs
import glob
outputfilename = 'data.txt'
input_patt = 'input/*.txt'
# Character class matching any single character of string.punctuation.
# See documentation here: http://docs.python.org/2/library/string.html
# NOTE(review): compiled but not used anywhere in the visible script.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Typographic quotation marks removed from every line before tokenizing.
toremove = ['”', '“', '«', '»']


def preprocess(input_pattern, output_path, tokenize=None):
    """Tokenize every file matching *input_pattern* into *output_path*.

    Each input file is read as UTF-8. A file that consists of a single line
    is split into pieces on ``'. '``; otherwise its lines are used as they
    are. The characters in ``toremove`` are deleted from every piece, the
    piece is tokenized, and the non-empty tokens are re-joined with single
    spaces. Every resulting sentence is written lower-cased and followed by
    one space, so the output is one continuous stream of text.

    Args:
        input_pattern: Glob pattern selecting the input files.
        output_path: File to (over)write, encoded as UTF-8.
        tokenize: Callable turning a string into an iterable of tokens.
            When omitted, ``word_tokenize`` imported at the top of the file
            is used.
    """
    with codecs.open(output_path, "w", "utf-8") as destination:
        for path in glob.glob(input_pattern):
            print('** READING: ', path)
            with codecs.open(path, 'r', "utf-8") as source:
                lines = source.readlines()

            if len(lines) == 1:
                # readlines() returns a list, which has no split() method;
                # split the single line it contains instead.
                # NOTE(review): indentation was lost, so it is assumed this
                # debug print belongs to the single-line branch.
                lines = lines[0].split('. ')
                print(lines)

            sentences = []
            for line in lines:
                for quote in toremove:
                    line = line.replace(quote, '')
                # The default tokenizer is looked up only when none was given.
                words = (tokenize or word_tokenize)(line)
                print(words)
                # `tokens` avoids shadowing the imported `string` module.
                tokens = [word.strip() for word in words]
                sentences.append(' '.join(token for token in tokens if token))

            for sentence in sentences:
                destination.write(sentence.strip().lower() + " ")


preprocess(input_patt, outputfilename)
print('*text is stripped*')
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment