Commit 60619c31 by Ventina

Upload New File

parent 5e793744
resource_package = __name__
#
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
import collections
import math
from xml.etree.ElementTree import ElementTree
############## Remove Punctuation, URLs and Tokenize ##################
def remove_punc_tokenize(sentence):
    tokens = []
    # strip URLs first, before punctuation removal breaks the '://' apart
    sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, " ")
    for w in CountVectorizer().build_tokenizer()(sentence):
        tokens.append(w)
    return tokens
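# Example (hypothetical input; assumes scikit-learn's default token pattern,
# which keeps only words of two or more characters):
#   remove_punc_tokenize("Watch now!\nhttps://example.com\nGreat show.")
#   -> ['Watch', 'now', 'Great', 'show']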
############## Case Folding ##############
def to_lower(tokens):
    return [x.lower() for x in tokens]
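# Example: to_lower(['Stranger', 'Things']) -> ['stranger', 'things']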
def generate_ngrams(data, n):
    ngram = []
    result = []
    # build the list of n-grams for each document
    for i in range(len(data)):
        sequences = [data[i][j:] for j in range(n)]
        grams = list(zip(*sequences))
        result.append([" ".join(gram) for gram in grams])
    # flatten the n-grams of all documents into a single list
    for i in range(len(result)):
        for j in range(len(result[i])):
            ngram.append(result[i][j])
    return ngram, result
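# Example:
#   generate_ngrams([['the', 'cat', 'sat'], ['a', 'dog']], 2)
#   -> (['the cat', 'cat sat', 'a dog'],
#       [['the cat', 'cat sat'], ['a dog']])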
def main(query):
    tree = ElementTree()
    tree.parse("apps/data/netflix_show.xml")
    all_doc_no = []
    all_title = []
    all_description = []
    all_cast = []
    all_year = []
    for node in tree.iter("show_id"):
        all_doc_no.append(node.text)
    for node in tree.iter("title"):
        all_title.append(node.text)
    for node in tree.iter("description"):
        all_description.append(node.text)
    for node in tree.iter("cast"):
        all_cast.append(node.text)
    for node in tree.iter("release_year"):
        all_year.append(node.text)
    N_DOC = len(all_description)
    all_sentence_doc = []
    for i in range(N_DOC):
        # join title and description with a space so the words do not fuse
        all_sentence_doc.append(all_title[i] + " " + all_description[i])
    tokens_doc = []
    for i in range(N_DOC):
        tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
    for i in range(N_DOC):
        tokens_doc[i] = to_lower(tokens_doc[i])
    stop_words = set(stopwords.words('english'))
    stopping = []
    for i in range(N_DOC):
        temp = []
        for j in tokens_doc[i]:
            if j not in stop_words:
                temp.append(j)
        stopping.append(temp)
    # drop tokens that contain any digit
    for i in range(N_DOC):
        tokens_doc[i] = [w for w in stopping[i] if not any(c.isdigit() for c in w)]
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemming = []
    for i in range(N_DOC):
        temp = []
        for j in tokens_doc[i]:
            temp.append(stemmer.stem(j))
        stemming.append(temp)
    all_tokens = []
    for i in range(N_DOC):
        for w in stemming[i]:
            all_tokens.append(w)
    # collect the unique stemmed tokens as the vocabulary
    new_sentence = ' '.join(all_tokens)
    for w in CountVectorizer().build_tokenizer()(new_sentence):
        all_tokens.append(w)
    all_tokens = set(all_tokens)
    alls = list(all_tokens)  # flat vocabulary list (not used further below)
    # preprocess the query with the same pipeline:
    # drop digits, strip punctuation, lowercase, remove stopwords, stem
    queri = []
    spl = query.split()
    for i in range(len(spl)):
        if not spl[i].isdigit():
            queri.append(spl[i])
    punc = []
    for i in range(len(queri)):
        no_punc = ""
        for j in range(len(queri[i])):
            if queri[i][j] not in string.punctuation:
                no_punc = no_punc + queri[i][j]
        punc.append(no_punc)
    lower = []
    for i in range(len(punc)):
        lower.append(punc[i].lower())
    stop = []
    for i in range(len(lower)):
        if lower[i] not in stop_words:
            stop.append(lower[i])
    stem = []
    for i in range(len(stop)):
        stem.append(stemmer.stem(stop[i]))
    join_word = ' '.join(stem)
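    # Example of the query pipeline (hypothetical query; Sastrawi is an
    # Indonesian stemmer, so most English tokens pass through unchanged):
    #   "The Walking Dead 2010"
    #   -> drop digits:      ['The', 'Walking', 'Dead']
    #   -> lowercase:        ['the', 'walking', 'dead']
    #   -> remove stopwords: ['walking', 'dead']
    #   join_word = 'walking dead'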
    # build n-grams of the same length as the processed query, then an
    # inverted index mapping each n-gram to the documents that contain it
    ngram, ngram_doc = generate_ngrams(stemming, len(stem))
    n_gram_index = {}
    for ngram_token in ngram:
        doc_no = []
        for i in range(N_DOC):
            if ngram_token in ngram_doc[i]:
                doc_no.append(all_doc_no[i])
        n_gram_index[ngram_token] = doc_no
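    # Shape sketch (hypothetical values): n_gram_index maps each n-gram
    # string to the show_ids of the documents containing it, e.g.
    #   {'walking dead': ['70143824', ...], 'dead season': [...], ...}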
    # count how often the query n-gram occurs in each document
    df = []
    for i in range(N_DOC):
        count = 0
        for j in range(len(ngram_doc[i])):
            if join_word == ngram_doc[i][j]:
                count += 1
        df.append(count)
    idf = []
    for i in range(len(df)):
        try:
            idf.append(math.log10(N_DOC / df[i]))
        except ZeroDivisionError:
            idf.append(0)  # keep idf numeric for documents without the n-gram
    # w(t, d) = (1 + log10(tf(t, d))) * idf
    # t = term (the query n-gram), d = document
    wtd = []
    l = []
    for i in range(N_DOC):
        dic = {}
        tf = ngram_doc[i].count(join_word)  # term frequency of the query n-gram
        if tf != 0:
            score = math.log10(tf)  # log10(tf(t, d))
            score += 1              # 1 + log10(tf(t, d))
            score *= idf[i]         # tf * idf
            dic['docno'] = all_doc_no[i]
            dic['judul'] = all_title[i]  # 'judul' = title
            dic['score'] = score
            dic['cast'] = all_cast[i]
            dic['year'] = all_year[i]
            l.append(dic)
    wtd.append(l)
    # sort the matching documents by descending tf-idf score
    hasil = []  # 'hasil' = results
    hasil.append(sorted(wtd[0], key=lambda x: x['score'], reverse=True))
    return hasil
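# Return shape of main() (a minimal sketch, values hypothetical):
#   [[{'docno': '80057281', 'judul': 'Stranger Things', 'score': 1.87,
#      'cast': '...', 'year': '2016'}, ...]]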
def detail(nomor):
    tree = ElementTree()
    tree.parse("apps/data/netflix_show.xml")
    all_doc_no = []
    all_title = []
    all_description = []
    all_cast = []
    all_year = []
    for node in tree.iter("show_id"):
        all_doc_no.append(node.text)
    for node in tree.iter("title"):
        all_title.append(node.text)
    for node in tree.iter("description"):
        all_description.append(node.text)
    for node in tree.iter("cast"):
        all_cast.append(node.text)
    for node in tree.iter("release_year"):
        all_year.append(node.text)
    N_DOC = len(all_description)
    text = []
    judul = []
    cast = []
    year = []
    doc_id = str(nomor)  # renamed from 'id' to avoid shadowing the builtin
    # look up the document whose show_id matches the requested number
    for i in range(N_DOC):
        if all_doc_no[i] == doc_id:
            text = all_description[i]
            judul = all_title[i]
            cast = all_cast[i]
            year = all_year[i]
    return text, judul, cast, year
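# A minimal usage sketch (hypothetical query; assumes apps/data/netflix_show.xml
# exists and the NLTK 'stopwords' corpus has been downloaded):
if __name__ == "__main__":
    for row in main("stranger things")[0][:5]:
        print(row['docno'], row['judul'], row['year'], row['score'])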