Commit 033fb2ae by Sartika Aritonang

Upload proximity.py

parent 97bf6435
import re
import ast
import math
import string
import xml.dom.minidom as minidom
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
def parse_xml():
    # Parse data/news.xml and collect each field of every news document.
    news_collection = minidom.parse("data/news.xml")
    news_id = news_collection.getElementsByTagName('ID')
    news_source = news_collection.getElementsByTagName('SOURCE')
    news_link = news_collection.getElementsByTagName('LINK')
    news_title = news_collection.getElementsByTagName('TITLE')
    news_author = news_collection.getElementsByTagName('AUTHOR')
    news_datetime = news_collection.getElementsByTagName('DATETIME')
    news_paragraph = news_collection.getElementsByTagName('PARAGRAPH')
    N_news = len(news_id)
    id_in_news = []
    sentence_in_source = []
    sentence_in_link = []
    sentence_in_title = []
    sentence_in_author = []
    sentence_in_datetime = []
    sentence_in_news = []
    for i in range(N_news):
        id_in_news.append(news_id[i].firstChild.data)
    for i in range(N_news):
        sentence_in_source.append(news_source[i].firstChild.data)
    for i in range(N_news):
        sentence_in_link.append(news_link[i].firstChild.data)
    for i in range(N_news):
        sentence_in_title.append(news_title[i].firstChild.data)
    for i in range(N_news):
        sentence_in_author.append(news_author[i].firstChild.data)
    for i in range(N_news):
        sentence_in_datetime.append(news_datetime[i].firstChild.data)
    for i in range(N_news):
        sentence_in_news.append(news_paragraph[i].firstChild.data)
    return ({'id_in_news': id_in_news, 'sentence_in_source': sentence_in_source,
             'sentence_in_link': sentence_in_link, 'sentence_in_title': sentence_in_title,
             'sentence_in_author': sentence_in_author, 'sentence_in_datetime': sentence_in_datetime,
             'sentence_in_news': sentence_in_news})
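# Illustrative only: the layout of data/news.xml assumed by parse_xml(),
# reconstructed from the tag names queried above. The root and record tag
# names and the nesting are assumptions, not taken from the repository:
#
#   <NEWS>
#     <DOC>
#       <ID>1</ID>
#       <SOURCE>...</SOURCE>
#       <LINK>https://...</LINK>
#       <TITLE>...</TITLE>
#       <AUTHOR>...</AUTHOR>
#       <DATETIME>...</DATETIME>
#       <PARAGRAPH>...</PARAGRAPH>
#     </DOC>
#     ...
#   </NEWS>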
def removePunctuation(textList):
    for i in range(len(textList)):
        # Strip URL lines before punctuation removal, otherwise the
        # 'https://' pattern can no longer match.
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        # Curly quotation marks are not in string.punctuation, remove them too.
        textList[i] = re.sub(r'“', '', textList[i])
        textList[i] = re.sub(r'”', '', textList[i])
    return textList
def token(sentence):
    return list(CountVectorizer().build_tokenizer()(sentence))
def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens
def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text
def get_token(file):
    # Clean and tokenize the titles and the article bodies; the returned list
    # holds the title token lists first, then the content token lists, so it
    # is twice as long as the number of news documents.
    content = removePunctuation(file['sentence_in_news'])
    title = removePunctuation(file['sentence_in_title'])
    contents = caseFolding(content)
    titles = caseFolding(title)
    token_contents = tokenize(contents)
    token_titles = tokenize(titles)
    tokens = []
    for i in token_titles:
        tokens.append(i)
    for j in token_contents:
        tokens.append(j)
    return tokens
def checkStopword(sentence, stop_words):
    return [w for w in sentence if w not in stop_words]
def stopwordRemove(textList):
    with open("data/id.stopwords.02.01.2016.txt", "r") as fd:
        stopwords = fd.read().splitlines()
    stop_words = set(stopwords)
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text
def numberRemove(textList):
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text
def stemming(textList):
    # Stem every token in place with the Sastrawi Indonesian stemmer.
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text
def getAllTerms(textList):
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))
def createIndex(textList):
    # Build a positional index: {term: {news_id: [positions...]}}.
    # textList is the output of get_token() after stopword removal, number
    # removal and stemming, so it holds 2*N entries (titles then contents);
    # the news IDs are duplicated to map both halves back to the same document.
    file = parse_xml()
    doc_ids = file['id_in_news'] * 2
    terms = getAllTerms(textList)
    proximity = {}
    for term in terms:
        position = {}
        for n in range(len(textList)):
            if term in textList[n]:
                position[doc_ids[n]] = []
                for i in range(len(textList[n])):
                    if term == textList[n][i]:
                        position[doc_ids[n]].append(i)
        proximity[term] = position
    return proximity
def save_indexing():
    # Run the full preprocessing pipeline and persist the positional index
    # where open_indexing() and results() expect to find it.
    file = parse_xml()
    tokens = get_token(file)
    tokens = stopwordRemove(tokens)
    tokens = numberRemove(tokens)
    tokens = stemming(tokens)
    indexing = createIndex(tokens)
    with open('data/index.txt', 'w') as fd:
        fd.write(str(indexing))
# save_indexing()
def open_indexing():
    # Read the index back; ast.literal_eval is a safer way than eval to
    # rebuild the dict literal written by save_indexing().
    with open("data/index.txt", "r") as fd:
        index = ast.literal_eval(fd.read())
    return index
def removePunctuationQuery(text):
    # Operates on the raw query string, not on a list of documents.
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for x in text:
        if x in punctuations:
            text = text.replace(x, "")
    return text
def queryPreprocessing(query):
    # Apply the same cleaning steps to the query that were applied to the
    # documents: punctuation, case folding, digits, URLs, stopwords, stemming.
    terms = []
    query = removePunctuationQuery(query)
    querys = [query]
    querys = caseFolding(querys)  # the index terms are lower-cased, so the query must be too
    for i in range(len(querys)):
        querys[i] = ''.join([c for c in querys[i] if not c.isdigit()])
        querys[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', querys[i], flags=re.MULTILINE)
        terms.append(word_tokenize(querys[i]))
    terms = numberRemove(terms)
    terms = stopwordRemove(terms)
    terms = stemming(terms)
    return terms
def queryInIndex(query, index):
    result = []
    for word in query:
        if word in index:
            result.append(word)
    return result
def df(query, index):
    # Document frequency: number of documents each query term appears in.
    docFreq = {}
    for word in query:
        if word in index:
            docFreq[word] = len(index[word])
    return docFreq
def idf(df, N):
    # Inverse document frequency: log10(N / df) for each query term.
    inv = {}
    for word in df:
        inv[word] = math.log10(N / df[word])
    return inv
def tf(query, index):
    # Term frequency: how often each query term occurs in each document,
    # taken from the length of its positions list in the index.
    termFreq = {}
    for word in query:
        freq = {}
        if word in index:
            for i in index[word]:
                freq[i] = len(index[word][i])
        termFreq[word] = freq
    return termFreq
def tfidf(tf, idf):
    # Log-weighted tf-idf: w(t, d) = (1 + log10(tf(t, d))) * idf(t).
    w = {}
    for word in tf:
        wtd = {}
        for doc in tf[word]:
            wtd[doc] = (1 + math.log10(tf[word][doc])) * idf[word]
        w[word] = wtd
    return w
def score(TFIDF):
    # Sum the tf-idf weights per document and rank document IDs by score.
    res = {}
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = 0
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = res[j] + TFIDF[i][j]
    sorted_dict = sorted(res, key=res.get, reverse=True)
    return ({'sorted_dict': sorted_dict, 'res': res})
def results(query):
    # Answer a free-text query: preprocess it, weight the matching terms with
    # tf-idf, score and rank the documents, and return the ranked metadata
    # together with the intermediate values of the computation.
    file = parse_xml()
    with open("data/index.txt", "r") as fd:
        index = ast.literal_eval(fd.read())
    terms = queryPreprocessing(query)
    querys = terms[0]
    querys = queryInIndex(querys, index)
    N = len(file['id_in_news'])
    docFrequency = df(querys, index)
    invDocFrequency = idf(docFrequency, N)
    termFrequency = tf(querys, index)
    TFIDF = tfidf(termFrequency, invDocFrequency)
    sc = score(TFIDF)
    result = []
    process = []
    for i in range(len(sc['sorted_dict'])):
        a = file['id_in_news'].index(sc['sorted_dict'][i])
        doc_score = sc['res'][sc['sorted_dict'][i]]
        doc_id = sc['sorted_dict'][i]
        doc_source = file['sentence_in_source'][a]
        doc_link = file['sentence_in_link'][a]
        doc_title = file['sentence_in_title'][a]
        doc_author = file['sentence_in_author'][a]
        doc_datetime = file['sentence_in_datetime'][a]
        doc_contents = file['sentence_in_news'][a][0:400] + '..........'
        result.append({'doc_score': doc_score, 'doc_id': doc_id, 'doc_source': doc_source,
                       'doc_link': doc_link, 'doc_title': doc_title, 'doc_author': doc_author,
                       'doc_datetime': doc_datetime, 'doc_contents': doc_contents})
    process.append({'terms': terms, 'TFIDF': TFIDF, 'docFrequency': docFrequency,
                    'invDocFrequency': invDocFrequency, 'termFrequency': termFrequency})
    return ({'result': result, 'process': process})
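
# Minimal usage sketch (an assumption, not part of the uploaded file): it
# presumes data/news.xml, the stopword list, and NLTK's tokenizer data are
# available, and that save_indexing() writes the index to data/index.txt.
# The query string below is purely hypothetical.
if __name__ == "__main__":
    save_indexing()
    ranked = results("banjir jakarta")
    for doc in ranked['result']:
        print(doc['doc_id'], doc['doc_score'], doc['doc_title'])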