Commit 033fb2ae authored May 28, 2020 by Sartika Aritonang
Upload proximity.py

parent 97bf6435
Showing 1 changed file with 287 additions and 0 deletions

proximity.py (project/news_site/proximity/proximity.py, new file, 0 → 100644): +287 / -0
import re
import math
import string
import xml.dom.minidom as minidom
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
def parse_xml():
    # Read data/news.xml and return the fields of every news item as parallel lists.
    news_collection = minidom.parse("data/news.xml")
    news_id = news_collection.getElementsByTagName('ID')
    news_source = news_collection.getElementsByTagName('SOURCE')
    news_link = news_collection.getElementsByTagName('LINK')
    news_title = news_collection.getElementsByTagName('TITLE')
    news_author = news_collection.getElementsByTagName('AUTHOR')
    news_datetime = news_collection.getElementsByTagName('DATETIME')
    news_paragraph = news_collection.getElementsByTagName('PARAGRAPH')
    N_news = len(news_id)
    id_in_news = []
    sentence_in_source = []
    sentence_in_link = []
    sentence_in_title = []
    sentence_in_author = []
    sentence_in_datetime = []
    sentence_in_news = []
    for i in range(N_news):
        ids = news_id[i].firstChild.data
        id_in_news.append(ids)
    for i in range(N_news):
        sentences = news_source[i].firstChild.data
        sentence_in_source.append(sentences)
    for i in range(N_news):
        sentences = news_link[i].firstChild.data
        sentence_in_link.append(sentences)
    for i in range(N_news):
        sentences = news_title[i].firstChild.data
        sentence_in_title.append(sentences)
    for i in range(N_news):
        sentences = news_author[i].firstChild.data
        sentence_in_author.append(sentences)
    for i in range(N_news):
        sentences = news_datetime[i].firstChild.data
        sentence_in_datetime.append(sentences)
    for i in range(N_news):
        sentences = news_paragraph[i].firstChild.data
        sentence_in_news.append(sentences)
    return ({'id_in_news': id_in_news,
             'sentence_in_source': sentence_in_source,
             'sentence_in_link': sentence_in_link,
             'sentence_in_title': sentence_in_title,
             'sentence_in_author': sentence_in_author,
             'sentence_in_datetime': sentence_in_datetime,
             'sentence_in_news': sentence_in_news})
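# For reference, parse_xml() expects an XML corpus along these lines; the tag
# names come from the calls above, but the exact nesting of data/news.xml is an
# assumption made here only for illustration (getElementsByTagName searches the
# whole document, so the nesting does not affect the code):
#
#   <NEWS_COLLECTION>
#     <NEWS>
#       <ID>1</ID>
#       <SOURCE>...</SOURCE>
#       <LINK>https://...</LINK>
#       <TITLE>...</TITLE>
#       <AUTHOR>...</AUTHOR>
#       <DATETIME>...</DATETIME>
#       <PARAGRAPH>...</PARAGRAPH>
#     </NEWS>
#   </NEWS_COLLECTION>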
def removePunctuation(textList):
    for i in range(len(textList)):
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
        textList[i] = re.sub(r'“', '', textList[i])
        textList[i] = re.sub(r'”', '', textList[i])
    return textList
def token(sentence):
    token = []
    for word in CountVectorizer().build_tokenizer()(sentence):
        token.append(word)
    return token

def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens
def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text
def get_token(file):
    # Preprocess titles and bodies (punctuation removal, case folding, tokenization)
    # and return one token list per field per news item.
    #file = parse_xml()
    content = removePunctuation(file['sentence_in_news'])
    title = removePunctuation(file['sentence_in_title'])
    contents = caseFolding(content)
    titles = caseFolding(title)
    token_contents = tokenize(contents)
    token_titles = tokenize(titles)
    token = []
    for i in token_titles:
        token.append(i)
    for j in token_contents:
        token.append(j)
    return token
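# Note: get_token returns the title token lists followed by the content token
# lists, so for N news items it yields 2*N entries in corpus order. createIndex
# below relies on this when it doubles the ID list to map list positions back
# to document IDs.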
def checkStopword(sentence, stop_words):
    sentence = [w for w in sentence if not w in stop_words]
    return sentence

def stopwordRemove(textList):
    with open("data/id.stopwords.02.01.2016.txt", "r") as fd:
        stopwords = fd.read().splitlines()
    stop_words = set(stopwords)
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text
def numberRemove(textList):
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text
def stemming(textList):
    # Stem every token in place with the Sastrawi (Indonesian) stemmer.
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text
def getAllTerms(textList):
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))
def createIndex(textList):
    # Build a positional index {term: {doc_id: [positions]}} over textList.
    # parse_xml() is needed here for the document IDs; the ID list is doubled
    # because get_token yields a title entry and a content entry per news item.
    file = parse_xml()
    #token = get_token()
    #tokenize = stopwordRemove(token)
    #tokenize = numberRemove(tokenize)
    #textList = stemming(tokenize)
    terms = getAllTerms(textList)
    proximity = {}
    for term in terms:
        position = {}
        for n in range(len(textList)):
            if (term in textList[n]):
                position[(file['id_in_news'] * 2)[n]] = []
                for i in range(len(textList[n])):
                    if (term == textList[n][i]):
                        position[(file['id_in_news'] * 2)[n]].append(i)
        proximity[term] = position
    return proximity
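# Shape of the index produced by createIndex (terms and IDs below are
# illustrative values only): a dict mapping each term to the documents it
# occurs in, and each document ID to the token positions of the term, e.g.
#
#   {'menteri': {'3': [0, 17], '7': [4]},
#    'jakarta': {'1': [2]}}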
def save_indexing():
    # Preprocess the corpus, build the positional index, and write it to index.txt.
    file = parse_xml()
    tokens = get_token(file)
    tokens = stopwordRemove(tokens)
    tokens = numberRemove(tokens)
    textList = stemming(tokens)
    indexing = createIndex(textList)
    file = open('index.txt', 'w')
    file.write(str(indexing))
    file.close()
# save_indexing()
def open_indexing():
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    index = eval(fi)
    return index
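# A stricter way to load the saved dict, if preferred: ast.literal_eval only
# accepts Python literals, unlike eval. The helper name below is illustrative
# and not used elsewhere in this module.
#
#   import ast
#   def open_indexing_safe():
#       with open("data/index.txt", "r") as fd:
#           return ast.literal_eval(fd.read())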
def removePunctuationQuery(textList):
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for x in textList:
        if x in punctuations:
            textList = textList.replace(x, "")
    return textList
def queryPreprocessing(query):
    terms = []
    query = removePunctuationQuery(query)
    querys = []
    querys.append(query)
    # Lower-case the query so its terms match the case-folded index terms.
    querys = caseFolding(querys)
    for i in range(len(querys)):
        querys[i] = ''.join([ch for ch in querys[i] if not ch.isdigit()])
        querys[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', querys[i], flags=re.MULTILINE)
        terms.append(word_tokenize(querys[i]))
    terms = numberRemove(terms)
    terms = stopwordRemove(terms)
    terms = stemming(terms)
    return terms
def queryInIndex(query, index):
    result = []
    for word in query:
        if word in index:
            result.append(word)
    return result

def df(query, index):
    docFreq = {}
    for word in query:
        if word in index:
            docFreq[word] = len(index[word])
    return docFreq
def idf(df, N):
    inv = {}
    for word in df:
        inv[word] = math.log10(N / df[word])
    return inv

def tf(query, index):
    termFreq = {}
    for word in query:
        freq = {}
        if word in index:
            for i in index[word]:
                freq[i] = len(index[word][i])
        termFreq[word] = freq
    return termFreq
def tfidf(tf, idf):
    w = {}
    for word in tf:
        wtd = {}
        for doc in tf[word]:
            wtd[doc] = (1 + (math.log10(tf[word][doc]))) * idf[word]
        w[word] = wtd
    return w
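# The weighting used above, written out: for a term t and document d,
#   idf(t)  = log10(N / df(t))
#   w(t, d) = (1 + log10(tf(t, d))) * idf(t)
# e.g. with N = 100 documents, df(t) = 10 and tf(t, d) = 3:
#   idf(t) = log10(10) = 1,  w(t, d) = (1 + log10(3)) * 1 ≈ 1.477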
def score(TFIDF):
    res = {}
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = 0
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = res[j] + TFIDF[i][j]
    sorted_dict = sorted(res, key=res.get, reverse=True)
    return ({'sorted_dict': sorted_dict, 'res': res})
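# score() sums the per-term weights for each document and ranks documents by
# that sum. For example (illustrative numbers):
#   TFIDF = {'t1': {'1': 0.5, '2': 0.2}, 't2': {'2': 0.4}}
#   res   = {'1': 0.5, '2': 0.6}
#   sorted_dict = ['2', '1']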
def results(query):
    # Preprocess the raw query string and score every matching document by summed tf-idf.
    querys = []
    querys.append(query)
    file = parse_xml()
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    index = eval(fi)
    terms = queryPreprocessing(query)
    querys = terms[0]
    querys = queryInIndex(querys, index)
    N = len(file['id_in_news'])
    tfidf_list = []
    docFrequency = df(querys, index)
    invDocFrequency = idf(docFrequency, N)
    termFrequency = tf(querys, index)
    TFIDF = tfidf(termFrequency, invDocFrequency)
    sc = score(TFIDF)
    relevanceDocNumber = []
    count = 0
    result = []
    process = []
    for i in range(len(sc['sorted_dict'])):
        relevanceDocNumber.append(int(sc['sorted_dict'][i]))
        a = file['id_in_news'].index(sc['sorted_dict'][i])
        rank = i + 1
        doc_score = sc['res'][sc['sorted_dict'][i]]
        doc_id = sc['sorted_dict'][i]
        doc_source = file['sentence_in_source'][a][:]
        doc_link = file['sentence_in_link'][a][:]
        doc_title = file['sentence_in_title'][a][:]
        doc_author = file['sentence_in_author'][a][:]
        doc_datetime = file['sentence_in_datetime'][a][:]
        doc_contents = file['sentence_in_news'][a][0:400] + '..........'
        result.append({'doc_score': doc_score,
                       'doc_id': doc_id,
                       'doc_source': doc_source,
                       'doc_link': doc_link,
                       'doc_title': doc_title,
                       'doc_author': doc_author,
                       'doc_datetime': doc_datetime,
                       'doc_contents': doc_contents})
    process.append({'terms': terms,
                    'TFIDF': TFIDF,
                    'docFrequency': docFrequency,
                    'invDocFrequency': invDocFrequency,
                    'termFrequency': termFrequency})
    return ({'result': result, 'process': process})
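A minimal sketch of how the module might be driven end to end, assuming data/news.xml and data/id.stopwords.02.01.2016.txt are in place. Note that save_indexing() writes index.txt while results() reads data/index.txt, so the index file has to be copied or moved between the two steps. The import line and the query string are illustrative, not part of the module.

# Usage sketch (assumes proximity.py is importable from the working directory).
from proximity import save_indexing, results

save_indexing()                       # builds the positional index and writes index.txt
# place the generated index at data/index.txt before querying
hits = results("menteri kesehatan")   # illustrative Indonesian query
for doc in hits['result']:
    print(doc['doc_score'], doc['doc_title'])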