Commit 14c358be authored by ana

script to extend dictionary

parent 423f2092
# -*- coding: utf-8 -*-
from arbres import arbres
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import fr_core_news_sm
import re
import pickle
# Definitions collected during this run: a set of (noun, definition) tuples
# that find_definition() fills and that is pickled at the end of the script.
dictionnaire1 = set()
# Debug output: shows the container type of the accumulator.
print(type(dictionnaire1))
# function to find nouns
# Lazily loaded default spaCy pipeline, shared by every find_nouns() call so
# the French model is read from disk only once per run.
_default_nlp = None


def _get_default_nlp():
    """Return the shared French spaCy pipeline, loading it on first use."""
    global _default_nlp
    if _default_nlp is None:
        _default_nlp = fr_core_news_sm.load()
    return _default_nlp


def find_nouns(citation, nlp=None):
    """Return the text of every token of *citation* tagged as a common noun.

    Args:
        citation: The text to analyse.
        nlp: Optional callable that turns a string into an iterable of
            tokens exposing ``text`` and ``pos_`` (e.g. an already loaded
            spaCy pipeline). Defaults to the cached ``fr_core_news_sm``
            model.

    Returns:
        A list of token texts, in document order, whose ``pos_`` is exactly
        ``"NOUN"``. Empty when no noun is found.
    """
    if nlp is None:
        nlp = _get_default_nlp()
    # Compare the tag itself instead of searching "NOUN" in a formatted
    # "text < POS" string: a token whose text contains "NOUN" must not match.
    return [token.text for token in nlp(citation) if token.pos_ == "NOUN"]
# function to find definition for each noun
def _clean_definition(raw_html):
    """Return the plain text of a definition element, without its example."""
    # remove example: keep only what precedes the example span
    example = ': <span class="ExempleDefinition">'
    without_example = raw_html.split(example)[0]
    # remove html tags ([^>] also covers tags that span several lines)
    return re.sub(r"<[^>]+>", "", without_example).strip()


def find_definition(noun, timeout=10):
    """Look up *noun* on larousse.fr and record its first definition.

    The first ``li.DivisionDefinition`` element of the page is reduced to
    plain text and stored as a ``(noun, definition)`` tuple in the
    module-level set ``dictionnaire1``.

    Args:
        noun: The word to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``dictionnaire1`` set. It is returned unchanged when
        the request fails or the page contains no definition.
    """
    from urllib.parse import quote

    # define webpage for specific noun; escape the word so characters such
    # as "/", "?" or spaces cannot alter the URL structure
    url = "https://www.larousse.fr/dictionnaires/francais"
    urlpage = url + '/' + quote(noun, safe='')
    print("urlpage:", urlpage)
    # find webpage; a network failure skips this noun instead of aborting
    # the whole run before the results are saved
    try:
        page = requests.get(urlpage, timeout=timeout)
    except RequestException as error:
        print("request failed:", urlpage, error)
        return dictionnaire1
    # get content
    soup = BeautifulSoup(page.content, 'lxml')
    # find specific class of definitions
    result = soup.find_all('li', class_="DivisionDefinition")
    if not result:
        # no definition on the page: nothing to record
        return dictionnaire1
    # select definition 1
    clean_definition = _clean_definition(str(result[0]))
    dictionnaire1.add((noun, clean_definition))
    return dictionnaire1
# get pickle with dictionary as set
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# a trusted earlier round of this script.
with open('definitions_3rounds.obj', 'rb') as definitions_dump:
    dictionnaire = list(pickle.load(definitions_dump))
print(len(dictionnaire))
# Words that already have a definition or were already queried this round.
# Entries are indexed as (word, definition) pairs, so a bare noun string can
# never be "in" the list itself; membership has to be tested on the words.
known_words = {element[0] for element in dictionnaire}
# look for new nouns inside every known definition
for element in dictionnaire:
    definition = element[1]
    nouns = find_nouns(definition)
    print("nouns:", nouns)
    for noun in nouns:
        if noun not in known_words:
            # remember the noun so it is fetched at most once
            known_words.add(noun)
            dictionnaire1 = find_definition(noun)
print('dictionnaire this round:', dictionnaire1)
print('length dictionnaire this round:', len(dictionnaire1))
# save dictionnaire as pickle; the context manager flushes and closes the file
with open('definitions_4rounds.obj', 'wb') as dumping_data:
    pickle.dump(dictionnaire1, dumping_data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment