Commit 698747e6 authored by manetta

adding the i-could-have-written-that writing-systems to the git

parent 5fbe8c5b
# -*- coding: utf-8 -*-
# this script extracts sentences from the BioScope xml and saves them with a certain/uncertain tag in a list
# it tags questions as uncertain sentences
# it also saves the sentences in separate certain/uncertain lists
# it creates a lemma dictionary of the speculation cues
from __future__ import print_function  # make the print() calls below behave consistently under Python 2
import cPickle  # Python 2 pickle; use pickle under Python 3
import io
import json
from bs4 import BeautifulSoup
from pattern.en import parse
from nltk import word_tokenize
import random
def seed():
    """
    Returns the constant used as the random function in random.shuffle below.
    Picks a random item from a list; the idea of the 'seed' is to make the research
    replicable: the randomisation is repeated from a fixed source number.
    """
    return 0.47231099848
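# Python 2's random.shuffle accepts an optional random() callable; passing the
# constant function above makes the permutation deterministic, e.g. (list is
# illustrative only):
#
#   items = range(10)
#   random.shuffle(items, seed)  # same order on every run, so results can be replicated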
# hoping these get picked up with bigrams & trigrams? (see the matching sketch
# after the lists below)
# possibility, hypothesis, possible, question, investigate, indicate, whether
# I added 'or' and all negations (part of cues)
multicues = ["raises the intriguing possibility", "remains to be elucidated", "raise the possibility",
             "raised the possibility", "raising the possibility", "raise the hypothesis", "advance the hypothesis",
             "not possible to ascertain", "address a number of questions", "cannot exclude the possibility",
             "no clear evidence", "not clearly delineated", "not been clearly elucidated", "raises an interesting question",
             "addressed the question of", "cannot rule out the possibility", "remains to be investigated", "indicates that",
             "indicating that", "indicate that", "indicated that", "lends strong support", "yet to be understood",
             "not fully understood", "increasing evidence", "compelling evidence", "cannot be excluded", "cannot exclude",
             "remains to be determined", "raises the question", "raises the possibility", "raises questions", "can be deduced", "refer to",
             "not known", "whether or not", "or", "no guarantee", "no evidence", "not evident", "not clear", "no proof", "not", "no",
             "issue is raised", "examined for", "consistent with", "no evidence of", "not exclude", "looks as", "prone to", "to confirm"]
#print(len(multicues)) #52
# this allows the list to be reduced + some cues could be picked up using synonyms
# it looks like the rest are negated unhedgers
multicues_min = ["remains to be elucidated", "no clear evidence", "not clearly delineated", "not been clearly elucidated",
                 "lends strong support", "yet to be understood", "not fully understood", "increasing evidence", "compelling evidence",
                 "cannot be excluded", "cannot exclude", "remains to be determined", "can be deduced", "not known", "or",
                 "no guarantee", "no evidence", "not evident", "not clear", "no proof", "not", "no", "issue is raised",
                 "examined for", "consistent with", "no evidence of", "not exclude", "looks as", "prone to", "to confirm"]
#print(len(multicues_min)) #30
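# A minimal sketch of how the multiword cues could be matched with n-grams, as
# hoped above; this is not part of the pipeline, and contains_multicue is an
# illustrative name only:
#
#   from nltk import ngrams, word_tokenize
#
#   def contains_multicue(sentence, cues=multicues):
#       tokens = word_tokenize(sentence.lower())
#       grams = set()
#       for n in range(1, 6):  # unigrams up to 5-grams ("address a number of questions")
#           grams.update(" ".join(g) for g in ngrams(tokens, n))
#       return any(cue in grams for cue in cues)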
# import documents
## full Bioscope corpus
#corpus = io.open('../1_extracting_the_corpus/full_papers.xml','r', encoding='utf-8').read()
corpus = io.open('combined.xml','r', encoding='utf-8').read()
#corpus = open('abstracts.xml').read()
#corpus = io.open('../1_extracting_the_corpus/full_papers_extract_10.xml', 'r', encoding='utf-8').read()
#corpus = codecs.open('combined.xml', 'r', 'utf-8').read()
## extract
#corpus = open('full_papers_extract_10.xml').read()
# create BeautifulSoup object
soup = BeautifulSoup(corpus, "html.parser")
#print(type(soup))
#print(soup.prettify())
# select sentence objects from xml
sentences = soup.find_all("sentence")
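# a BioScope sentence element looks roughly like this (ids illustrative):
#
#   <sentence id="S1.1">This <xcope id="X1.1"><cue type="speculation"
#   ref="X1.1">suggests</cue> that the protein is involved</xcope>.</sentence>
#
# negation cues carry type="negation" instead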
# get strings, cues, tags, text of tags
collection = []
uncertainty = []
certainty = []
lemma_lexicon = set()
print("Sentences found: ", len(sentences))
for key, sentence in enumerate(sentences):
    print("Extracting sentence ", key+1)
    # this gets the string
    sentence_text = sentence.text
    # checks if sentence is a question (endswith avoids an IndexError on empty strings)
    if sentence_text.endswith("?"):
        # select the sentence, add class
        pair = [sentence_text, "uncertain"]
        # add pair to collection
        collection.append(pair)
        # add pair to uncertainty phrases
        uncertainty.append(pair)
    else:
        # this gets the cue
        cues = sentence.find_all("cue")
        if cues:
            sentence_spec_tag = 0
            sentence_neg_tag = 0
            for cue in cues:
                # finds the modifying word
                cue_text = cue.text.lower()
                #print("cue text:", cue_text)
                # finds "speculation" or "negation"
                cue_tag = cue["type"]
                if cue_tag == "speculation":
                    sentence_spec_tag += 1
                    # find the POS of cue_text in order to lemmatize:
                    # tokenize the string, then parse it into token, POS, lemma
                    if cue_text not in multicues:
                        par = parse(cue_text,
                                    tokenize=True,    # Split punctuation marks from words?
                                    tags=False,       # Parse part-of-speech tags? (NN, JJ, ...)
                                    chunks=False,     # Parse chunks? (NP, VP, PNP, ...)
                                    relations=False,  # Parse chunk relations? (-SBJ, -OBJ, ...)
                                    lemmata=True,     # Parse lemmata? (ate => eat)
                                    encoding='utf-8'  # Input string encoding
                                    )
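                        # assuming pattern's usual slash-joined output, each token
                        # comes out as "token/lemma", so splitting the whole string
                        # on "/" and taking the last field keeps the lemma of the
                        # final token (for a multiword cue only that last lemma
                        # would be stored)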
                        words = par.split("/")
                        lemma = words[-1]
                        lemma_lexicon.add(lemma)
                else:
                    sentence_neg_tag += 1
            if sentence_spec_tag > 0:
                # select the sentence, add class
                pair = [sentence_text, "uncertain"]
                # add pair to collection
                collection.append(pair)
                # add pair to uncertainty phrases
                uncertainty.append(pair)
            else:
                # only negation cues found: add the sentence as certain
                pair = [sentence_text, "certain"]
                collection.append(pair)
                # add to certain collection
                certainty.append(pair)
        # no cues in text, add sentence as certain
        else:
            pair = [sentence_text, "certain"]
            collection.append(pair)
            certainty.append(pair)
#uncertainty_equalize = uncertainty[:4888]
uncertainty_equalize = uncertainty[:200]
print(uncertainty_equalize[:10])
#certainty_equalize = certainty[:4888]
certainty_equalize = certainty[:200]
print(certainty_equalize[:10])
collection_equalize = certainty_equalize + uncertainty_equalize
print(len(collection_equalize))
print(collection_equalize[:20])
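# Python 2's random.shuffle takes an optional random() function (removed in
# Python 3.11); the constant seed() defined above makes the shuffle reproducible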
random.shuffle(collection_equalize, seed)
print(collection_equalize[:20])
# print("collection:", collection[:10])
# print("certainty:", certainty[:10])
# print("uncertainty:", uncertainty[:10])
# print("lemma lexicon:", lemma_lexicon)
# print("********************")
# print
print("uncertainty length list:", len(uncertainty)) #2454
print("certainty length list:", len(certainty)) #12097
print("********************")
print()
print("length sentences objects:", len(sentences))
print("length extracted collection:", len(collection))
#print("length equalized collection:", len(collection_equalize))
print("type sentence:", type(sentence_text))
# with io.open("lemma_lexicon.txt", "w", encoding="utf-8") as destination:
# destination.write('\n'.join(lemma_lexicon))
# https://pythontips.com/2013/08/08/storing-and-loading-data-with-json/
# with io.open("collection.json", "w", encoding="utf-8") as destination:
# # print "final collection", collection
# data = json.dumps(collection, ensure_ascii=False, encoding="utf-8")
# if type(data) is str:
# data = data.decode('utf-8')
# destination.write(data)
# # # # save as pickle (note: the pickle protocol belongs to dump, not open)
# cPickle.dump(collection, open("bioscope_full_papers_2670.p", "wb"), 2)
#cPickle.dump(collection, open("bioscope_combined_14541.p", "wb"), 2)
#pickle.dump(collection, open("bioscope_combined_14541.p", "wb"))
#cPickle.dump(collection_equalize, open("bioscope_combined_equalized_4888.p", "wb"), 2)
# pickle.dump(lemma_lexicon, open("bioscope_lemma_lexicon.p", "wb"), protocol=2)
#cPickle.dump(collection, open("collection.p", "wb"))
cPickle.dump(collection_equalize, open("bioscope_equalized_200.p", "wb"))
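# to read the collection back later (a minimal sketch, using the filename from
# the dump above):
#
#   collection_equalize = cPickle.load(open("bioscope_equalized_200.p", "rb"))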