Commit 60619c31 by Ventina

Upload New File

parent 5e793744
resource_package = __name__
#
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
import collections
import math
from xml.etree.ElementTree import ElementTree
############## Remove Punctuation, URLs and Tokenize ##################
def remove_punc_tokenize(sentence):
    tokens = []
    # strip URLs first, before punctuation removal breaks the '://' apart
    sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, " ")
    for w in CountVectorizer().build_tokenizer()(sentence):
        tokens.append(w)
    return tokens
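# Example (hypothetical input; assumes scikit-learn's default token pattern,
# which keeps only words of two or more characters):
#   remove_punc_tokenize("Watch now!\nhttps://example.com\nGreat show.")
#   -> ['Watch', 'now', 'Great', 'show']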
############## Case Folding ##############
def to_lower(tokens):
    return [x.lower() for x in tokens]
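# Example: to_lower(['Stranger', 'Things']) -> ['stranger', 'things']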
def generate_ngrams(data, n):
    ngram = []
    result = []
    # build the list of n-grams for each document
    for i in range(len(data)):
        sequences = [data[i][j:] for j in range(n)]
        grams = list(zip(*sequences))
        result.append([" ".join(gram) for gram in grams])
    # flatten the n-grams of all documents into a single list
    for i in range(len(result)):
        for j in range(len(result[i])):
            ngram.append(result[i][j])
    return ngram, result
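# Example:
#   generate_ngrams([['the', 'cat', 'sat'], ['a', 'dog']], 2)
#   -> (['the cat', 'cat sat', 'a dog'],
#       [['the cat', 'cat sat'], ['a dog']])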
def main(query):
    tree = ElementTree()
    tree.parse("apps/data/netflix_show.xml")
    all_doc_no = []
    all_title = []
    all_description = []
    all_cast = []
    all_year = []
    for node in tree.iter("show_id"):
        all_doc_no.append(node.text)
    for node in tree.iter("title"):
        all_title.append(node.text)
    for node in tree.iter("description"):
        all_description.append(node.text)
    for node in tree.iter("cast"):
        all_cast.append(node.text)
    for node in tree.iter("release_year"):
        all_year.append(node.text)
    N_DOC = len(all_description)
    all_sentence_doc = []
    for i in range(N_DOC):
        # join title and description with a space so the words do not fuse
        all_sentence_doc.append(all_title[i] + " " + all_description[i])
    tokens_doc = []
    for i in range(N_DOC):
        tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
    for i in range(N_DOC):
        tokens_doc[i] = to_lower(tokens_doc[i])
    stop_words = set(stopwords.words('english'))
    stopping = []
    for i in range(N_DOC):
        temp = []
        for j in tokens_doc[i]:
            if j not in stop_words:
                temp.append(j)
        stopping.append(temp)
    # drop tokens that contain any digit
    for i in range(N_DOC):
        tokens_doc[i] = [w for w in stopping[i] if not any(c.isdigit() for c in w)]
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemming = []
    for i in range(N_DOC):
        temp = []
        for j in tokens_doc[i]:
            temp.append(stemmer.stem(j))
        stemming.append(temp)
    all_tokens = []
    for i in range(N_DOC):
        for w in stemming[i]:
            all_tokens.append(w)
    # collect the unique stemmed tokens as the vocabulary
    new_sentence = ' '.join(all_tokens)
    for w in CountVectorizer().build_tokenizer()(new_sentence):
        all_tokens.append(w)
    all_tokens = set(all_tokens)
    alls = list(all_tokens)  # flat vocabulary list (not used further below)
    # preprocess the query with the same pipeline:
    # drop digits, strip punctuation, lowercase, remove stopwords, stem
    queri = []
    spl = query.split()
    for i in range(len(spl)):
        if not spl[i].isdigit():
            queri.append(spl[i])
    punc = []
    for i in range(len(queri)):
        no_punc = ""
        for j in range(len(queri[i])):
            if queri[i][j] not in string.punctuation:
                no_punc = no_punc + queri[i][j]
        punc.append(no_punc)
    lower = []
    for i in range(len(punc)):
        lower.append(punc[i].lower())
    stop = []
    for i in range(len(lower)):
        if lower[i] not in stop_words:
            stop.append(lower[i])
    stem = []
    for i in range(len(stop)):
        stem.append(stemmer.stem(stop[i]))
    join_word = ' '.join(stem)
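    # Example of the query pipeline (hypothetical query; Sastrawi is an
    # Indonesian stemmer, so most English tokens pass through unchanged):
    #   "The Walking Dead 2010"
    #   -> drop digits:      ['The', 'Walking', 'Dead']
    #   -> lowercase:        ['the', 'walking', 'dead']
    #   -> remove stopwords: ['walking', 'dead']
    #   join_word = 'walking dead'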
    # build n-grams of the same length as the processed query, then an
    # inverted index mapping each n-gram to the documents that contain it
    ngram, ngram_doc = generate_ngrams(stemming, len(stem))
    n_gram_index = {}
    for ngram_token in ngram:
        doc_no = []
        for i in range(N_DOC):
            if ngram_token in ngram_doc[i]:
                doc_no.append(all_doc_no[i])
        n_gram_index[ngram_token] = doc_no
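    # Shape sketch (hypothetical values): n_gram_index maps each n-gram
    # string to the show_ids of the documents containing it, e.g.
    #   {'walking dead': ['70143824', ...], 'dead season': [...], ...}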
    # count how often the query n-gram occurs in each document
    df = []
    for i in range(N_DOC):
        count = 0
        for j in range(len(ngram_doc[i])):
            if join_word == ngram_doc[i][j]:
                count += 1
        df.append(count)
    idf = []
    for i in range(len(df)):
        try:
            idf.append(math.log10(N_DOC / df[i]))
        except ZeroDivisionError:
            idf.append(0)  # keep idf numeric for documents without the n-gram
    # w(t, d) = (1 + log10(tf(t, d))) * idf
    # t = term (the query n-gram), d = document
    wtd = []
    l = []
    for i in range(N_DOC):
        dic = {}
        tf = ngram_doc[i].count(join_word)  # term frequency of the query n-gram
        if tf != 0:
            score = math.log10(tf)  # log10(tf(t, d))
            score += 1              # 1 + log10(tf(t, d))
            score *= idf[i]         # tf * idf
            dic['docno'] = all_doc_no[i]
            dic['judul'] = all_title[i]  # 'judul' = title
            dic['score'] = score
            dic['cast'] = all_cast[i]
            dic['year'] = all_year[i]
            l.append(dic)
    wtd.append(l)
    # sort the matching documents by descending tf-idf score
    hasil = []  # 'hasil' = results
    hasil.append(sorted(wtd[0], key=lambda x: x['score'], reverse=True))
    return hasil
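# Return shape of main() (a minimal sketch, values hypothetical):
#   [[{'docno': '80057281', 'judul': 'Stranger Things', 'score': 1.87,
#      'cast': '...', 'year': '2016'}, ...]]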
def detail(nomor):
    tree = ElementTree()
    tree.parse("apps/data/netflix_show.xml")
    all_doc_no = []
    all_title = []
    all_description = []
    all_cast = []
    all_year = []
    for node in tree.iter("show_id"):
        all_doc_no.append(node.text)
    for node in tree.iter("title"):
        all_title.append(node.text)
    for node in tree.iter("description"):
        all_description.append(node.text)
    for node in tree.iter("cast"):
        all_cast.append(node.text)
    for node in tree.iter("release_year"):
        all_year.append(node.text)
    N_DOC = len(all_description)
    text = []
    judul = []
    cast = []
    year = []
    doc_id = str(nomor)  # renamed from 'id' to avoid shadowing the builtin
    # look up the document whose show_id matches the requested number
    for i in range(N_DOC):
        if all_doc_no[i] == doc_id:
            text = all_description[i]
            judul = all_title[i]
            cast = all_cast[i]
            year = all_year[i]
    return text, judul, cast, year
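# A minimal usage sketch (hypothetical query; assumes apps/data/netflix_show.xml
# exists and the NLTK 'stopwords' corpus has been downloaded):
if __name__ == "__main__":
    for row in main("stranger things")[0][:5]:
        print(row['docno'], row['judul'], row['year'], row['score'])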