Commit 9b9c9ec3 authored by Zofia Baranczuk

Corrected usage in README

parent cd414353
@@ -18,7 +18,7 @@ Requirements:
Python 2.7, nltk, csv, pdfminer, bs4, requests, urllib2
Usage:
-> python SysReviewer.py kwashiorkor
+> python sysReview.py kwashiorkor
will download papers which have kwashiorkor in any field. The output will be saved in:
The .pdf files are saved in:
../Output/Output_kwashiorkor_pdf
@@ -28,7 +28,7 @@ The key phrases are saved as:
../Output/kw_kwashiorkor.csv
-> python SysReviewer.py "Malawi AND HIV AND DHS"
+> python SysReview.py "Malawi AND HIV AND DHS"
will download the papers which have Malawi, HIV and DHS in any of the fields.
The .pdf files are saved in:
../Output/Output_Malawi+AND+HIV+AND+DHS_pdf
import os
import sys
import urllib2
from bs4 import BeautifulSoup
import requests
from urllib2 import Request, urlopen
from pyPdf import PdfFileWriter, PdfFileReader
from StringIO import StringIO
import datetime
# TODO: create a list of DOIs to add the entries into Zotero, e.g. prepare a bibtex file
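# A minimal sketch of that step (not wired into process_page yet). It assumes the
# paper pages expose the usual "citation_doi"/"citation_title" meta tags; the tag
# names, the entry key and the output format are assumptions.
def append_bibtex_entry(paper_html, bibtex_path):
    soup = BeautifulSoup(paper_html, "html.parser")
    doi_tag = soup.find("meta", {"name": "citation_doi"})
    if doi_tag is None:
        return None
    doi = doi_tag["content"]
    title_tag = soup.find("meta", {"name": "citation_title"})
    title = title_tag["content"] if title_tag is not None else ""
    # append a minimal @article entry; the resulting .bib file can be imported into Zotero
    with open(bibtex_path, "a") as bib:
        bib.write("@article{%s,\n  doi = {%s},\n  title = {%s}\n}\n"
                  % (doi.replace("/", "_"), doi, title))
    return doi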
def download_file(url, output):
f = open(output, 'wb')
webFile = urllib2.urlopen(url)
f.write(webFile.read())
webFile.close()
f.close()
def process_page(my_url, output_path):
r = requests.get(my_url)
soup_all = BeautifulSoup(r.content, "html.parser")
titles = soup_all.find_all("h4", {"class": "paper-list-title"})
print("titles")
#print(titles[2])#('href')
#print("Titles")
for title in titles:
new_t = title.find("a").get("href")
print(new_t)
url_paper = pap + new_t
html_paper = urllib2.urlopen(url_paper).read()
print("next paper")
print(url_paper)
soup = BeautifulSoup(html_paper, 'html.parser')
pdf_url = soup.find("meta", {"name":"citation_pdf_url"})['content']
#title = soup.find("meta", {"name":"citation_title"})['content']
file_name = new_t.split('/')[-1]
output = output_path + "/" + '_'.join(file_name.split()) + '.pdf'
download_file(pdf_url, output)
pap = "http://paperity.org"
url = "http://paperity.org/search/?q="
url_n = "http://paperity.org/search/"
url_end = "?q="
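# result pages are paginated: page n of a query is built as url_n + str(n) + url_end + query,
# i.e. http://paperity.org/search/<n>?q=<query> (see parse() below)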
query = sys.argv[1] # "legionella"
q = url + query
def parse(query):
    q = url + query
    print(q)
    # the README documents the PDF output location as ../Output/Output_<query>_pdf
    output_dir = "../Output/Output_" + query + "_pdf"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    process_page(q, output_dir)
    # walk through the remaining result pages until one of them fails
    i = 1
    while True:
        i = i + 1
        q_next = url_n + str(i) + url_end + query
        print(q_next)
        try:
            process_page(q_next, output_dir)
        except Exception:
            break
import os
import glob
import sys
import subprocess
import re
import collections
#import xlsxwriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
reload(sys)
sys.setdefaultencoding('utf8')
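# The reload()/setdefaultencoding() pair above is the usual Python 2 workaround so that
# writing the extracted, non-ASCII heavy text to plain files does not raise UnicodeEncodeError.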
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
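# convert_pdf_to_txt() above runs every page of the PDF through pdfminer's TextConverter
# and returns the accumulated text as a single string.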
#add proper parsing of arguments
# if ocr needed -- https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/
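# A rough sketch of the OCR route mentioned above, for PDFs with no text layer.
# pdf2image and pytesseract are assumptions here (neither is in the project
# requirements), so the imports stay inside the function and the rest of the
# script runs without them.
def ocr_pdf_to_txt(path):
    from pdf2image import convert_from_path
    import pytesseract
    # render each page to an image, OCR it, and join the page texts
    pages = convert_from_path(path)
    return "\n".join(pytesseract.image_to_string(page) for page in pages)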
def remove_non_ascii(text):
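    # the octal escapes below are the UTF-8 bytes of the ligature characters
    # U+FB00 (ff), U+FB01 (fi) and U+FB02 (fl) that pdfminer leaves in the text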
text = re.sub(r'[\357\254\200]+', 'ff', text)
text = re.sub(r'[\357\254\201]+', 'fi', text)
text = re.sub(r'[\357\254\202]+', 'fl', text)
text = re.sub('fffi ', 'fi', text)
text = re.sub('ff ', 'ff', text)
text = re.sub('[^0-9a-zA-Z]+', ' ', text)
# text = re.sub(r'[^\x00-\x7F]+',' ', text)
return(text)
def get_toc(pdf_path):  # pdf outline / structure of the file
    infile = open(pdf_path, 'rb')
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = list()
    try:
        # collect the outline titles so the caller can join them into one string
        for (level, title, dest, a, structelem) in document.get_outlines():
            print remove_non_ascii(title.strip())
            toc.append(remove_non_ascii(title.strip()))
    except PDFNoOutlines:
        pass
    return toc
# TODO: change input, not order dependent
# parser = argparse.ArgumentParser(description='Description of your program')
#parser.add_argument('-f','--foo', help='Description for foo argument', required=True)
#parser.add_argument('-b','--bar', help='Description for bar argument', required=True)
#args = vars(parser.parse_args())
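# A possible argparse layout for the four positional arguments read below, kept
# commented out (like the sketch above) so the sys.argv handling stays in charge;
# the argument names are illustrative.
#parser = argparse.ArgumentParser(description='Convert downloaded PDFs to text and collect their outlines')
#parser.add_argument('districts_file', help='text file with the district names to count')
#parser.add_argument('articles_dir', help='directory with the downloaded .pdf files')
#parser.add_argument('summary_file', help='output file listing each parsed paper and its outline')
#parser.add_argument('txt_dir', help='directory where the extracted .txt files are written')
#args = vars(parser.parse_args())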
districts_file_name = sys.argv[1]
stats = [[0 for i in range(50)] for j in range(50)] # change to length
articles_dir = sys.argv[2]
districts_file = open(districts_file_name)
districts = districts_file.read()
districts = districts.split()
for district in districts:
print district
#workbook = xlsxwriter.Workbook('zosienka.xlsx')
#worksheet1 = workbook.add_worksheet() # writing directly into the excel file very slow. Have as data matrix, then write into excel.
col = 1
row = 0
#for district in districts:
# worksheet1.write(row, col, district)
# col = col + 1
dir_files = sys.argv[2]
summary_file = sys.argv[3]
last_file = '0'
#if len(sys.argv)>3:
# last_file = sys.argv[3]
txt_dir = sys.argv[4]
summary = open(summary_file, "w")
os.chdir(dir_files)
for f in glob.glob("*.pdf"):
print(f)
# if f > last_file:
try:
retvalue = convert_pdf_to_txt(f)
retvalue = remove_non_ascii(retvalue)
new_name = f.replace(".pdf", ".txt")
new_name = txt_dir + new_name
print(new_name)
print(os.getcwd())
with open(new_name, "w+") as ff:
ff.write(retvalue)
except:
print(f + " not parsed")
continue
#file.close()
    #add different words list
summary.write(f)
# col = 0
# row = row + 1
# worksheet1.write(row, col, f)
toc = get_toc(f)
if toc == list():
toc = retvalue.split('.')[0]
#print(toc)
else:
toc = ' '.join(get_toc(f))
#print(toc)
summary.write(toc)
summary.write(" \n")
summary.close()
# retvalue_l = retvalue.lower()
# wordlist = retvalue_l.split()
# wordfreq = []
# for district in districts:
# freq = wordlist.count(district)
# wordfreq.append(freq)
# col = col + 1
# worksheet1.write(row, col, freq)
# print(str(zip(districts, wordfreq))) #add to stats and transpose
# #add wordfreq to excel, districts, wordfreq (or before, in for)
# print(" ")
# print(collections.Counter(retvalue.split()))
# print(" ")
# print(" ")
#workbook.close()
# at the end:
#district 1: 17 papers, dates: 2001-2012
#...
#district 1: paper 1, 12, dates 2001, 2005, 2007, NGO1, NGO2, intervention1, intervention2
# check the title --- more points for word in the title. Don't look for words in Bibliography
import sys

import paperity2pdf
import pdf2txt
import txt2key_phrases

# glue script: download the PDFs for the query, then convert them and extract key phrases
query = sys.argv[1]
paperity2pdf.parse(query)
output_pdf = "../output" + query
convert("../output")  # conversion entry point expected to come from pdf2txt
import sys
import re
import os
import collections
import csv
import math
import operator
import glob
import nltk
import itertools
from short_similarity_sujipal import similarity
from sklearn.cluster import DBSCAN
#from textblob import TextBlob as tb
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
#w = sys.argv[1]
lancaster_stemmer = LancasterStemmer()
#a = lancaster_stemmer.stem(w)
from nltk.corpus import wordnet as wn
wordnet_lemmatizer = WordNetLemmatizer()
#wordnet_lemmatizer.lemmatize("dogs")
def remove_non_ascii(text):
text = re.sub(r'[\357\254\200]+', 'ff', text)
text = re.sub(r'[\357\254\201]+', 'fi', text)
text = re.sub(r'[\357\254\202]+', 'fl', text)
text = re.sub('fffi ', 'fi', text)
text = re.sub('fff ', 'f', text)
text = re.sub('fff', 'f', text)
text = re.sub('-\n','', text)
text = re.sub('[^0-9a-zA-Z]+', ' ', text)
# text = re.sub(r'[^\x00-\x7F]+',' ', text)
return(text)
def tf(word, blob):
return blob.words.count(word) / len(blob.words)
def n_containing(word, bloblist):
return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))
def tfidf(word, blob, bloblist):
return tf(word, blob) * idf(word, bloblist)
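# The tf/idf/tfidf helpers above expect TextBlob-like objects exposing a .words list
# (see the commented-out TextBlob import); idf() uses log(N / (1 + df)) so a word
# appearing in no document does not divide by zero. They are not used by the
# RAKE-based pipeline below.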
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
import RAKE
ro = RAKE.Rake("./Malawi_search_words/stopwords_long.txt")
dir_files = sys.argv[2]
glob_keywords_file = {}
glob_files = set()
glob_keyword = set()
#print(glob_files)
print("ten plik")
i=0
for f in glob.glob(dir_files + "/" + "*.txt"):
with open(f, 'r') as content_file:
content = content_file.read()
i = i+1
print i
content = remove_non_ascii(content)
content = " ".join(content.split())
#content = re.sub('- ', '',content)
#content = re.sub(' -', '',content)
#content = re.sub('-', '',content)
mytext = " ".join(content.split())
# words = mytext.split()
# stem_words = [wordnet_lemmatizer.lemmatize(x) for x in words]
# stem2_words = [wordnet_lemmatizer.lemmatize(x,'v') for x in words]
#wordnet_lemmatizer.lemmatize("dogs")
# mytext = " ".join(words)
# change - into ""
# lemmatize all
keywords = ro.run(mytext)
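    # ro.run() returns (phrase, score) pairs; the filters below keep phrases of
    # one to three words with a RAKE score above 5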
print(f)
short_keywords = [k for (k,v) in keywords if len(k.split()) > 0 and v > 5 and len(k.split())<4]
#import pdb; pdb.set_trace()
paper = os.path.basename(f)
short_kw_dict = {(paper,k):v for (k,v) in keywords if len(k.split())>0 and v > 5 and len(k.split())<4}
#short_kw_list = {k:v for (k,v) in keywords if len(k) < 30 and v > 3}
#for (f,k),v in short_kw.iteritems():
glob_keyword |= set(short_keywords)
glob_files.add(os.path.basename(paper))
glob_keywords_file.update(short_kw_dict)
print(len(glob_keyword))
print(len(glob_files))
list_zeros = [["" for n in xrange(len(glob_files)+3)] for _ in xrange(len(glob_keyword)+1)]
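# list_zeros layout: row 0 holds the file names (from column 2 on), column 0 holds the
# keyword, column 1 is left free for the cluster label assigned in the commented-out
# DBSCAN block below, and the remaining cells hold that keyword's RAKE score in that
# file (0 if absent).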
for (y,f) in enumerate(glob_files):
list_zeros[0][y+2]=f
for (x,k) in enumerate(glob_keyword):
list_zeros[x+1][0]=k
if (f, k) in glob_keywords_file:
list_zeros[x+1][y+2] = glob_keywords_file[(f,k)]
# print(glob_keywords_file[(f,k)])
else:
list_zeros[x+1][y+2] = 0
#nltk.cluster.util.cosine_distance("HIV testing","HIV infection")
final_list = []
final_keywords = []
print("ten plik")
for i in xrange(len(list_zeros)):
importance = sum(1 for x in list_zeros[i] if x > 0)
if importance > 6:
final_list.append(list_zeros[i])
final_keywords.append(list_zeros[i][0])
#w = csv.DictWriter( sys.stdout, fields )
#for key,val in sorted(dw.items()):
# row = {'org': key}
# row.update(val)
# w.writerow(row)
#table_keywords = [(k1,k2) for k1 in final_keywords for k2 in final_keywords]
#distance_matrix = [(k1,k2, similarity(k1,k2, True)) for (k1,k2) in table_keywords]
#j=0
#distance_matrix = [[1000 for n in xrange(len(final_keywords))] for k in xrange(len(final_keywords))]
#for i in xrange(1,len(final_keywords)):
# for j in xrange(i+1,len(final_keywords)-1):
# # wn.path_similarity(mother, country)
# distance_matrix[i][j] = 1/(0.00001 + similarity(final_keywords[i], final_keywords[j], True))
# distance_matrix[j][i] = distance_matrix[i][j]#1/(0.00001 + similarity(final_keywords[i], final_keywords[j], True))
# #print(i)
# #print(j)
# #print(final_keywords[i])
# #print(final_keywords[j])
# #print(distance_matrix[i][j])
# #print(distance_matrix[j][i])
# #sorted_kw = sorted(short_kw.items(), key=operator.itemgetter(1), reverse=True)
#db = DBSCAN(eps=3, min_samples=2, metric="precomputed")
#y_db = db.fit_predict(distance_matrix)
#print(len(y_db))
##print(len(final_keywords))
#for i in range(len(y_db)):
# final_list[i][1] = y_db[i]
# # print([final_keywords[i], y_db[i]])
#
kw_file = sys.argv[1]
with open(kw_file, 'w') as kw_output:
#for i in xrange(len(glob_files)):
# kw_output.write(list[k])
#kw_output.write(short_kw)
w = csv.writer(kw_output)
w.writerows(final_list)
# for
###print()