Commit 2d32b613 authored by Zofia Baranczuk

Creating the git repository from the folder

parent ff71a3e0
import urllib2
from bs4 import BeautifulSoup
import requests
from urllib2 import Request, urlopen
from pyPdf import PdfFileWriter, PdfFileReader
from StringIO import StringIO
import datetime
import os
import errno
# create a list of DOIs to add entries into Zotero
# e.g. prepare a bibtex file
def download_file(url, output):
    f = open(output, 'wb')
    webFile = urllib2.urlopen(url)
    f.write(webFile.read())
    webFile.close()
    f.close()
def process_page(my_url, output_dir):
    pap = "http://paperity.org"
    r = requests.get(my_url)
    soup_all = BeautifulSoup(r.content, "html.parser")
    titles = soup_all.find_all("h4", {"class": "paper-list-title"})
    print("titles")
    #print(titles[2])#('href')
    #print("Titles")
    for title in titles:
        new_t = title.find("a").get("href")
        print(new_t)
        url_paper = pap + new_t
        html_paper = urllib2.urlopen(url_paper).read()
        print("next paper")
        print(url_paper)
        soup = BeautifulSoup(html_paper, 'html.parser')
        pdf_url = soup.find("meta", {"name": "citation_pdf_url"})['content']
        #title = soup.find("meta", {"name":"citation_title"})['content']
        file_name = new_t.split('/')[-1]
        output = output_dir + "/" + '_'.join(file_name.split()) + '.pdf'
        download_file(pdf_url, output)
def parse(query, pdf_dir):
    pap = "http://paperity.org"
    url = "http://paperity.org/search/?q="
    url_n = "http://paperity.org/search/"
    url_end = "?q="
    today = datetime.date.today()
    q = url + query
    html = urllib2.urlopen(q).read()
    print(q)
    try:
        os.makedirs(pdf_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    process_page(q, pdf_dir)
    i = 1
    while i < 51:
        i = i + 1
        q_next = url_n + str(i) + url_end + query
        print(q_next)
        try:
            process_page(q_next, pdf_dir)
        except Exception:
            break  # stop once a result page can no longer be processed
#query = sys.argv[1] # "legionella"
#parse(query)
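# Usage sketch (illustrative, not part of the original script): expands the
# commented-out invocation above. The output directory name "pdfs_<query>" is
# an assumption.
if __name__ == '__main__':
    import sys
    query = sys.argv[1]  # e.g. "legionella"
    parse(query, "pdfs_" + query)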
import urllib2
from bs4 import BeautifulSoup
import requests
from urllib2 import Request, urlopen
from pyPdf import PdfFileWriter, PdfFileReader
from StringIO import StringIO
import datetime
import sys
# create a list of DOIs to add entries into Zotero
# e.g. prepare a bibtex file
def download_file(url, output):
    f = open(output, 'wb')
    webFile = urllib2.urlopen(url)
    f.write(webFile.read())
    webFile.close()
    f.close()
def process_page(my_url, output_path):
    r = requests.get(my_url)
    soup_all = BeautifulSoup(r.content, "html.parser")
    titles = soup_all.find_all("h4", {"class": "paper-list-title"})
    print("titles")
    #print(titles[2])#('href')
    #print("Titles")
    for title in titles:
        new_t = title.find("a").get("href")
        print(new_t)
        url_paper = pap + new_t
        html_paper = urllib2.urlopen(url_paper).read()
        print("next paper")
        print(url_paper)
        soup = BeautifulSoup(html_paper, 'html.parser')
        pdf_url = soup.find("meta", {"name": "citation_pdf_url"})['content']
        #title = soup.find("meta", {"name":"citation_title"})['content']
        file_name = new_t.split('/')[-1]
        output = output_path + "/" + '_'.join(file_name.split()) + '.pdf'
        download_file(pdf_url, output)
pap = "http://paperity.org"
url = "http://paperity.org/search/?q="
url_n = "http://paperity.org/search/"
url_end = "?q="
query = sys.argv[1]  # "legionella"
q = url + query

def parse():
    today = datetime.date.today()
    html = urllib2.urlopen(q).read()
    #print(html)
    print(q)
    output_dir = "../output_" + str(today) + query
    process_page(q, output_dir)
    i = 1
    while True:
        i = i + 1
        q_next = url_n + str(i) + url_end + query
        print(q_next)
        try:
            process_page(q_next, "../output_legionella")
        except Exception:
            break  # stop once a result page can no longer be processed
import os
import glob
import sys
import errno
import subprocess
import re
import collections
#import xlsxwriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
reload(sys)
sys.setdefaultencoding('utf8')
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
#add proper parsing of arguments
# if ocr needed -- https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/
def remove_non_ascii(text):
    # map the UTF-8 bytes of the ligatures U+FB00 (ff), U+FB01 (fi), U+FB02 (fl) back to ASCII
    text = re.sub(r'[\357\254\200]+', 'ff', text)
    text = re.sub(r'[\357\254\201]+', 'fi', text)
    text = re.sub(r'[\357\254\202]+', 'fl', text)
    text = re.sub('fffi ', 'fi', text)
    text = re.sub('ff ', 'ff', text)
    # replace any remaining non-alphanumeric characters with a single space
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    # text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text
def get_toc(pdf_path):  # pdf structure of the file
    infile = open(pdf_path, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = list()
    try:
        for (level, title, dest, a, structelem) in document.get_outlines():
            print remove_non_ascii(title.strip())
            toc = '-'
    except PDFNoOutlines:
        pass
    return toc
# TODO: change input, not order dependent
# parser = argparse.ArgumentParser(description='Description of your program')
#parser.add_argument('-f','--foo', help='Description for foo argument', required=True)
#parser.add_argument('-b','--bar', help='Description for bar argument', required=True)
#args = vars(parser.parse_args())
###pdf_dir = sys.argv[1]
###summary_file = txt_dir + "/summary.txt" #sys.argv[3]
#last_file = '0'
###txt_dir = sys.argv[2]
def convert(pdf_dir, txt_dir, summary_file):
    summary = open(summary_file, "w")
    try:
        os.makedirs(txt_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    #os.chdir(pdf_dir)
    for f in glob.glob(pdf_dir + "/" + "*.pdf"):
        print(f)
        # if f > last_file:
        try:
            retvalue = convert_pdf_to_txt(f)
            retvalue = remove_non_ascii(retvalue)
            file_name = os.path.basename(f)
            txt_name = file_name.replace(".pdf", ".txt")
            txt_path = txt_dir + "/" + txt_name
            print(txt_path)
            print(os.getcwd())
            with open(txt_path, "w+") as ff:
                ff.write(retvalue)
        except Exception:
            print(f + " not parsed")
            continue
        summary.write(f)
        toc = get_toc(f)
        if toc == list():
            toc = retvalue.split('.')[0]
            #print(toc)
        else:
            toc = ' '.join(get_toc(f))
            #print(toc)
        summary.write(toc)
        summary.write(" \n")
    summary.close()
# TODO: check the title --- give more weight to a word found in the title; don't look for words in the Bibliography
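# Usage sketch (illustrative, not part of the original script), following the
# commented-out sys.argv hints above: run as "python <script>.py <pdf_dir> <txt_dir>".
if __name__ == '__main__':
    pdf_dir = sys.argv[1]
    txt_dir = sys.argv[2]
    summary_file = txt_dir + "/summary.txt"
    convert(pdf_dir, txt_dir, summary_file)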
import os
import glob
import sys
import subprocess
import re
import collections
#import xlsxwriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
reload(sys)
sys.setdefaultencoding('utf8')
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
#add proper parsing of arguments
# if ocr needed -- https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/
def remove_non_ascii(text):
    # map the UTF-8 bytes of the ligatures U+FB00 (ff), U+FB01 (fi), U+FB02 (fl) back to ASCII
    text = re.sub(r'[\357\254\200]+', 'ff', text)
    text = re.sub(r'[\357\254\201]+', 'fi', text)
    text = re.sub(r'[\357\254\202]+', 'fl', text)
    text = re.sub('fffi ', 'fi', text)
    text = re.sub('ff ', 'ff', text)
    # replace any remaining non-alphanumeric characters with a single space
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    # text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text
def get_toc(pdf_path):  # pdf structure of the file
    infile = open(pdf_path, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = list()
    try:
        for (level, title, dest, a, structelem) in document.get_outlines():
            print remove_non_ascii(title.strip())
            toc = '-'
    except PDFNoOutlines:
        pass
    return toc
# TODO: change input, not order dependent
# parser = argparse.ArgumentParser(description='Description of your program')
#parser.add_argument('-f','--foo', help='Description for foo argument', required=True)
#parser.add_argument('-b','--bar', help='Description for bar argument', required=True)
#args = vars(parser.parse_args())
districts_file_name = sys.argv[1]
stats = [[0 for i in range(50)] for j in range(50)]  # TODO: replace 50 with the actual lengths
articles_dir = sys.argv[2]
districts_file = open(districts_file_name)
districts = districts_file.read()
districts = districts.split()
for district in districts:
    print district
#workbook = xlsxwriter.Workbook('zosienka.xlsx')
#worksheet1 = workbook.add_worksheet() # writing directly into the excel file is very slow; keep the counts in a data matrix, then write it into excel
col = 1
row = 0
#for district in districts:
#    worksheet1.write(row, col, district)
#    col = col + 1
dir_files = sys.argv[2]
summary_file = sys.argv[3]
last_file = '0'
#if len(sys.argv) > 3:
#    last_file = sys.argv[3]
txt_dir = sys.argv[4]
summary = open(summary_file, "w")
os.chdir(dir_files)
for f in glob.glob("*.pdf"):
    print(f)
    # if f > last_file:
    try:
        retvalue = convert_pdf_to_txt(f)
        retvalue = remove_non_ascii(retvalue)
        new_name = f.replace(".pdf", ".txt")
        new_name = txt_dir + new_name
        print(new_name)
        print(os.getcwd())
        with open(new_name, "w+") as ff:
            ff.write(retvalue)
    except Exception:
        print(f + " not parsed")
        continue
    #file.close()
    #add different word lists
    summary.write(f)
    # col = 0
    # row = row + 1
    # worksheet1.write(row, col, f)
    toc = get_toc(f)
    if toc == list():
        toc = retvalue.split('.')[0]
        #print(toc)
    else:
        toc = ' '.join(get_toc(f))
        #print(toc)
    summary.write(toc)
    summary.write(" \n")
summary.close()
# retvalue_l = retvalue.lower()
# wordlist = retvalue_l.split()
# wordfreq = []
# for district in districts:
# freq = wordlist.count(district)
# wordfreq.append(freq)
# col = col + 1
# worksheet1.write(row, col, freq)
# print(str(zip(districts, wordfreq))) #add to stats and transpose
# #add wordfreq to excel, districts, wordfreq (or before, in for)
# print(" ")
# print(collections.Counter(retvalue.split()))
# print(" ")
# print(" ")
#workbook.close()
# at the end, produce e.g.:
#   district 1: 17 papers, dates: 2001-2012
#   ...
#   district 1: paper 1, 12, dates 2001, 2005, 2007, NGO1, NGO2, intervention1, intervention2
# TODO: check the title --- give more weight to a word found in the title; don't look for words in the Bibliography
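# A minimal sketch (an assumption about the intent of the commented-out block
# above, not part of the original): count how often each district name occurs
# in one converted article.
def count_district_freqs(article_text, districts):
    wordlist = article_text.lower().split()
    return [(district, wordlist.count(district)) for district in districts]
# e.g. wordfreq = count_district_freqs(retvalue, districts)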
# Python/NLTK implementation of algorithm to detect similarity between
# short sentences described in the paper - "Sentence Similarity based
# on Semantic Nets and Corpus Statistics" by Li, et al.
# Results achieved are NOT identical to those reported in the paper, but
# this is very likely due to differences between the way the algorithm is
# described in the paper and the way I implemented it.
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
import math
import numpy as np
import sys
# Parameters to the algorithm. Currently set to the values that were reported
# in the paper to produce the "best" results.
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85
brown_freqs = dict()
N = 0
######################### word similarity ##########################
def get_best_synset_pair(word_1, word_2):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.
    """
    max_sim = -1.0
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    else:
        #max_sim = -1.0
        #best_pair = None, None
        #for synset_1 in synsets_1:
        #    for synset_2 in synsets_2:
        #        sim = wn.path_similarity(synset_1, synset_2)
        #        if sim > max_sim:
        #            max_sim = sim
        #            best_pair = synset_1, synset_2
        return synsets_1[0], synsets_2[0]
        # return best_pair
def length_dist(synset_1, synset_2):
    """
    Return a measure of the length of the shortest path in the semantic
    ontology (Wordnet in our case as well as the paper's) between two
    synsets.
    """
    l_dist = sys.maxint
    if synset_1 is None or synset_2 is None:
        return 100.0
    if synset_1 == synset_2:
        # if synset_1 and synset_2 are the same synset return 0
        l_dist = 0.0
    else:
        wset_1 = set([str(x.name()) for x in synset_1.lemmas()])
        wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
        if len(wset_1.intersection(wset_2)) > 0:
            # if synset_1 != synset_2 but there is word overlap, return 1.0
            l_dist = 0.0
        else:
            # just compute the shortest path between the two
            l_dist = synset_1.shortest_path_distance(synset_2)
            if l_dist is None:
                l_dist = 100.0
    # normalize path length to the range [0,1]
    return math.exp(-ALPHA * l_dist)
#nyms = ['hypernyms', 'hyponyms', 'meronyms', 'holonyms', 'part_meronyms', 'sister_terms', 'troponyms', 'inherited_hypernyms']
def hierarchy_dist(synset_1, synset_2):
    """
    Return a measure of depth in the ontology to model the fact that
    nodes closer to the root are broader and have less semantic similarity
    than nodes further away from the root.
    """
    h_dist = sys.maxint
    if synset_1 is None or synset_2 is None:
        return h_dist
    if synset_1 == synset_2:
        # return the depth of one of synset_1 or synset_2
        h_dist = max([x[1] for x in synset_1.hypernym_distances()])
    else:
        # find the max depth of the least common subsumer
        hypernyms_1 = {x[0]: x[1] for x in synset_1.hypernym_distances()}
        hypernyms_2 = {x[0]: x[1] for x in synset_2.hypernym_distances()}
        lcs_candidates = set(hypernyms_1.keys()).intersection(
            set(hypernyms_2.keys()))
        if len(lcs_candidates) > 0:
            lcs_dists = []
            for lcs_candidate in lcs_candidates:
                lcs_d1 = 0
                if hypernyms_1.has_key(lcs_candidate):
                    lcs_d1 = hypernyms_1[lcs_candidate]
                lcs_d2 = 0
                if hypernyms_2.has_key(lcs_candidate):
                    lcs_d2 = hypernyms_2[lcs_candidate]
                lcs_dists.append(max([lcs_d1, lcs_d2]))
            h_dist = max(lcs_dists)
        else:
            h_dist = 10000
    # this expression equals tanh(BETA * h_dist), which saturates towards 1 as the subsumer depth grows
    return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) /
            (math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
def word_similarity(word_1, word_2):
    synset_pair = get_best_synset_pair(word_1, word_2)
    return (length_dist(synset_pair[0], synset_pair[1]) *
            hierarchy_dist(synset_pair[0], synset_pair[1]))
######################### sentence similarity ##########################
def most_similar_word(word, word_set):
    """
    Find the word in the joint word set that is most similar to the word
    passed in. We use the algorithm above to compute word similarity between
    the word and each word in the joint word set, and return the most similar
    word and the actual similarity value.
    """
    max_sim = -1.0
    sim_word = ""
    for ref_word in word_set:
        sim = word_similarity(word, ref_word)
        if sim > max_sim:
            max_sim = sim
            sim_word = ref_word
    return sim_word, max_sim
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_freqs.has_key(word):
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if not brown_freqs.has_key(lookup_word) else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
def semantic_vector(words, joint_words, info_content_norm):
    """
    Computes the semantic vector of a sentence. The sentence is passed in as
    a collection of words. The size of the semantic vector is the same as the
    size of the joint word set. The elements are 1 if a word in the sentence
    already exists in the joint word set, or the similarity of the word to the
    most similar word in the joint word set if it doesn't. Both values are
    further normalized by the word's (and similar word's) information content
    if info_content_norm is True.
    """
    sent_set = set(words)
    semvec = np.zeros(len(joint_words))
    i = 0
    for joint_word in joint_words:
        if joint_word in sent_set:
            # if the word in the union exists in the sentence, s(i) = 1 (unnormalized)
            semvec[i] = 1.0
            if info_content_norm:
                semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
        else:
            # find the most similar word in the joint set; use its similarity if above the PHI threshold, else 0
            sim_word, max_sim = most_similar_word(joint_word, sent_set)
            semvec[i] = max_sim if max_sim > PHI else 0.0
            if info_content_norm:
                semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
        i = i + 1
    return semvec
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
    """
    Computes the semantic similarity between two sentences as the cosine
    similarity between the semantic vectors computed for each sentence.
    """
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = set(words_1).union(set(words_2))
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
    return np.dot(vec_1, vec_2.T) / (0.00001 + np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
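# Usage sketch (illustrative, not part of the original): compare two short
# sentences with and without information-content normalization; the example
# sentences below are assumptions.
if __name__ == '__main__':
    s1 = "Legionella was detected in the hospital water supply."
    s2 = "The bacteria were found in the water system of the clinic."
    print semantic_similarity(s1, s2, False)
    print semantic_similarity(s1, s2, True)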
######################### word order similarity ##########################