Commit 8f565798 by Yolanda Nainggolan

added searching, fixed result and lyrics

parent 2d25e3c9
...@@ -2,19 +2,86 @@ resource_package = __name__ ...@@ -2,19 +2,86 @@ resource_package = __name__
import string import string
import re import re
import collections
import math
import pandas as pd
import json
import xml.dom.minidom as minidom
import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count from itertools import count
import collections try:
import math from future_builtins import zip
import xml.etree.ElementTree as et except ImportError: # not 2.6+ or is 3.x
from xml.etree.ElementTree import ElementTree try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
##############Show Dataframe########################
def show_dataframe(parse_data):
    """Flatten the parsed song dataset into index-aligned column lists.

    Args:
        parse_data: Parsed XML tree exposing ``getroot()``. Every child of the
            root is read as one document carrying ``DOCNO``, ``SONG``,
            ``ARTIST`` and ``LYRICS`` child elements.

    Returns:
        A dict with the keys ``"DOCNO"`` (list of int), ``"SONG"``,
        ``"ARTIST"`` and ``"LYRICS"`` (lists of str, or None where the element
        is missing or empty). Position ``i`` of every list describes the same
        document.

    Raises:
        ValueError: If a ``DOCNO`` text is not an integer literal.
    """
    def child_text(node, tag):
        # The None check has to be made on the looked-up child: the loop
        # variable itself is never None, so guarding on it protects nothing.
        child = node.find(tag)
        return child.text if child is not None else None

    # Keyed by the raw DOCNO text so that a repeated DOCNO keeps its first
    # position and its last values, as the previous DataFrame transpose did.
    records = {}
    for node in parse_data.getroot():
        docno = child_text(node, "DOCNO")
        if docno is None:
            # Without a document number the row cannot be converted or listed.
            continue
        records[docno] = (
            child_text(node, "SONG"),
            child_text(node, "ARTIST"),
            child_text(node, "LYRICS"),
        )

    values = list(records.values())
    context = {
        "DOCNO": [int(docno) for docno in records],
        "SONG": [row[0] for row in values],
        "ARTIST": [row[1] for row in values],
        "LYRICS": [row[2] for row in values],
    }
    return context
##############N_DOC########################
def data_var(tree, path="InvertedIndexSimulator/data/dataset_STBI.xml"):
    """Load the dataset into ``tree`` and collect its per-document fields.

    Args:
        tree: An ``ElementTree`` instance; it is (re)populated by parsing
            ``path``.
        path: Location of the XML dataset. Defaults to the bundled dataset so
            existing one-argument calls keep working.

    Returns:
        A tuple ``(all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc)``:
        the raw texts of every ``DOCNO``, ``SONG`` and ``LYRICS`` element, the
        number of ``LYRICS`` elements, and one "title lyrics" string per
        document.

    Raises:
        IndexError: If the file holds fewer ``SONG`` than ``LYRICS`` elements.
    """
    tree.parse(path)

    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_lyrics = [node.text for node in tree.iter("LYRICS")]
    N_DOC = len(all_lyrics)

    # Title and lyrics are joined with a space so the last title word does not
    # fuse with the first lyric word when the text is tokenised later.
    # ``text`` is None for an empty element, hence the ``or ""`` fallbacks.
    all_sentence_doc = [
        (all_song[i] or "") + " " + (all_lyrics[i] or "")
        for i in range(N_DOC)
    ]
    return all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc
##############Remove Punctuation###################
def remove_punc_tokenize(sentence): def remove_punc_tokenize(sentence):
tokens = [] tokens = []
for punctuation in string.punctuation: for punctuation in string.punctuation:
sentence = sentence.replace(punctuation," ") sentence = sentence.replace(punctuation," ")
...@@ -29,24 +96,80 @@ def remove_punc_tokenize(sentence): ...@@ -29,24 +96,80 @@ def remove_punc_tokenize(sentence):
def to_lower(tokens): def to_lower(tokens):
tokens = [x.lower() for x in tokens] tokens = [x.lower() for x in tokens]
return tokens return tokens
##############Load Data########################
def load_data(dcmnt_xml):
    """Collect the document numbers and lyric texts from a DOM document.

    Args:
        dcmnt_xml: A DOM document (for example from ``xml.dom.minidom.parse``)
            containing ``DOCNO`` and ``LYRICS`` elements.

    Returns:
        A tuple ``(all_doc_no, N_DOC, all_sentence_doc_sample)``: the ``DOCNO``
        element nodes, their count, and one string per document made of a
        leading space followed by the lyric text.

    Raises:
        IndexError: If there are fewer ``LYRICS`` than ``DOCNO`` elements.
    """
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_text = dcmnt_xml.getElementsByTagName('LYRICS')
    N_DOC = len(all_doc_no)

    all_sentence_doc_sample = []
    for i in range(N_DOC):
        # An empty <LYRICS/> element has no child node at all, so firstChild
        # is None; treat that as empty text instead of failing on ``.data``.
        first_child = all_text[i].firstChild
        lyric_text = first_child.data if first_child is not None else ''
        all_sentence_doc_sample.append(' ' + lyric_text)
    return all_doc_no, N_DOC, all_sentence_doc_sample
##############Indexing########################
def indexing(N_DOC, tokens_doc, all_doc_no):
    """Build a positional (proximity) index over the tokenised documents.

    Args:
        N_DOC: Number of documents to index.
        tokens_doc: Sequence whose item ``n`` is the token list of document
            ``n``.
        all_doc_no: Sequence of DOM ``DOCNO`` element nodes, aligned with
            ``tokens_doc``; the text of each node's first child is used as the
            document identifier.

    Returns:
        A dict ordered by token. Each token maps to a dict of
        ``{document identifier: [1-based positions of the token]}``. Only
        tokens that occur in at least one document are listed.
    """
    postings = {}
    for n in range(N_DOC):
        tokens = tokens_doc[n]
        if not tokens:
            # Nothing to index; the DOCNO node is deliberately not touched.
            continue
        doc_id = all_doc_no[n].firstChild.data

        # Single pass over the document instead of rescanning it once per
        # vocabulary token.
        positions_in_doc = {}
        for position, token in enumerate(tokens, start=1):
            positions_in_doc.setdefault(token, []).append(position)

        # Assigning (not extending) keeps the earlier behaviour for repeated
        # document identifiers: the later document's positions win.
        for token, positions in positions_in_doc.items():
            postings.setdefault(token, {})[doc_id] = positions

    return {token: postings[token] for token in sorted(postings)}
def generate_ngrams(data, n):
    """Build the word n-grams of every document.

    Args:
        data: Sequence of token lists, one per document.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A tuple ``(ngram, result)``: ``result`` holds one list of
        space-joined n-grams per document, and ``ngram`` is the same n-grams
        flattened into a single list in document order. A document shorter
        than ``n`` tokens contributes an empty list.
    """
    result = []
    for tokens in data:
        # n views of the token list, each shifted one step further; zipping
        # them yields every window of n consecutive tokens.
        shifted = [tokens[offset:] for offset in range(n)]
        result.append([" ".join(window) for window in zip(*shifted)])

    # Flatten the per-document lists into one list covering all documents.
    ngram = [gram for doc_grams in result for gram in doc_grams]
    return ngram, result
from nltk.corpus import stopwords from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) stop_words = set(stopwords.words('english'))
...@@ -63,190 +186,139 @@ def stemming(tokens): ...@@ -63,190 +186,139 @@ def stemming(tokens):
tokens[i] = stemmer.stem(tokens[i]) tokens[i] = stemmer.stem(tokens[i])
return tokens return tokens
def main(query):
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text) def searching(dcmnt_xml, query):
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_song = dcmnt_xml.getElementsByTagName('SONG')
all_lyrics = dcmnt_xml.getElementsByTagName('LYRICS')
N_DOC = len(all_doc_no)
all_sentence_doc = [] all_sentence_doc = []
for i in range(N_DOC): for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i]) sentence_doc = all_song[i].firstChild.data +' '+ all_lyrics[i].firstChild.data
all_sentence_doc.append(sentence_doc)
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = to_lower(tokens_doc[i]) tokens_doc[i] = to_lower(tokens_doc[i])
stop_words = set(stopwords.words('english'))
stopping = []
for i in range(N_DOC): for i in range(N_DOC):
temp = [] tokens_doc[i] = stop_word_token(tokens_doc[i])
for j in tokens_doc[i]:
if j not in stop_words:
temp.append(j)
stopping.append(temp)
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = ([w for w in stopping[i] if not any(j.isdigit() for j in w)]) tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
factory = StemmerFactory() for i in range(N_DOC):
stemmer = factory.create_stemmer() tokens_doc[i] = stemming(tokens_doc[i])
stemming = [] all_tokens =[]
for i in range(N_DOC): for i in range(N_DOC):
temp=[]
for j in tokens_doc[i]: for j in tokens_doc[i]:
# print(j) all_tokens.append(j)
temp.append(stemmer.stem(j))
stemming.append(temp) new_sentences = ' '.join([w for w in all_tokens])
all_tokens = [] for j in CountVectorizer().build_tokenizer()(new_sentences):
for i in range(N_DOC): all_tokens.append(j)
for w in stemming[i]:
all_tokens.append(w) all_tokens = set(all_tokens)
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
import collections
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
kunci = []
nilai = []
for key, value in proximity_index[query].items():
kunci.append(key)
nilai.append(value)
dict = {}
for key in kunci:
for value in nilai:
dict[key] = value
nilai.remove(value)
break
xtree = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
xroot = xtree.getroot()
new_sentence = ' '.join([w for w in all_tokens]) df_cols = ["SONG"]
rows = []
for w in CountVectorizer().build_tokenizer()(new_sentence): for node in xroot:
all_tokens.append(w) lirik = node.find("SONG").text if node is not None else None
rows.append({"SONG": lirik})
all_tokens = set(all_tokens) df = pd.DataFrame(rows, columns = df_cols)
alls = []
for i in all_tokens:
alls.append(i)
queri=[]
spl = query.split()
for i in range(len(spl)):
if not spl[i].isdigit():
queri.append(spl[i])
punc = []
for i in range(len(queri)):
no_punc = ""
for j in range(len(queri[i])):
if queri[i][j] not in string.punctuation:
no_punc = no_punc + queri[i][j]
punc.append(no_punc)
lower=[]
for i in range(len(punc)):
lower.append(punc[i].lower())
stop = []
for i in range(len(lower)):
if lower[i] not in stop_words:
stop.append(lower[i])
stem = []
for i in range(len(stop)):
stem.append(stemmer.stem(stop[i]))
join_word = ' '.join([w for w in stem]) nomor = []
for i in dict:
ngram, ngram_doc = generate_ngrams(stemming, len(stem)) nomor.append(int(i))
n_gram_index = {} judul = []
for ngram_token in ngram: for i in nomor:
doc_no = [] judul.append(df['SONG'][i-1])
for i in range(N_DOC):
if(ngram_token in ngram_doc[i]): hasil = {}
doc_no.append(all_doc_no[i]) for key in nomor:
n_gram_index[ngram_token] = doc_no for value in judul:
hasil[key] = value
judul.remove(value)
break
numb = []
tit = []
df = [] for i, j in hasil.items():
numb.append(i)
tit.append(j)
res = {}
for key in numb:
for value in tit:
res[key] = value
tit.remove(value)
break
return res
for i in range(N_DOC): def detail(id):
count = 0
for j in range(len(ngram_doc[i])): import pandas as pd
if join_word == ngram_doc[i][j]: import xml.etree.ElementTree as et
count+=1 import numpy as np
df.append(count)
idf = []
for i in range(len(df)):
try:
idf.append(math.log10(N_DOC/df[i]))
except ZeroDivisionError:
idf.append(str(0))
#w(t, d)
#t = term
#d = document
wtd = []
l = []
for i in range(N_DOC):
dic = {}
tf = ngram_doc[i].count(join_word) # menghitung nilai tf
if tf != 0:
score = math.log10(tf) #log10(tf(t,d))
score+=1 # 1 + log(tf(t,d))
score*=idf[i] #tf * idf
idx = all_doc_no[i]
judul = all_song[i]
dic['docno'] = idx
dic['judul'] = judul
dic['score'] = score
l.append(dic)
wtd.append(l) # [i+1] = defenisi nomor dokumen; score = wtd
# print(score)
hasil = [] xtree = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
hasil.append(sorted(wtd[0], key = lambda x : x['score'], reverse = True)) xroot = xtree.getroot()
return hasil df_cols = ["SONG", "LYRICS"]
rows = []
def detail(nomor): for node in xroot:
tree = et() judul = node.find("SONG").text if node is not None else None
tree.parse("apps/data/dataset_STBI.xml") lirik = node.find("LYRICS").text if node is not None else None
rows.append({"SONG": judul,
"LYRICS":lirik})
all_doc_no = [] df = pd.DataFrame(rows, columns = df_cols)
all_song = []
all_text = [] lyrics = df['LYRICS'][id-1]
judul = df['SONG'][id-1]
return lyrics ,judul
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
# all_song.append(node.text.replace("\n"," "))
all_song.append(node.text)
head = all_song
for node in tree.iter("LYRICS"):
# all_text.append(node.text.replace("\n"," "))
all_text.append(node.text)
N_DOC = len(all_text)
text = []
judul=[]
hasil = []
id = str(nomor)
for i in range(N_DOC):
check = all_doc_no[i]
if check == id:
text = all_text[i]
judul = all_song[i]
return text,judul
\ No newline at end of file
...@@ -55,15 +55,13 @@ footer { ...@@ -55,15 +55,13 @@ footer {
border-radius: 15px; border-radius: 15px;
padding: 20px; padding: 20px;
margin-top: 10px; margin-top: 10px;
width: auto; width: 100%;
} }
.carda { table{
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2); table-layout: fixed;
border-radius: 15px; border: 1px solid black;
padding: 20px; width: 100px;
margin-top: 10px;
width: max-content;
} }
.jumbotron { .jumbotron {
...@@ -155,11 +153,6 @@ button:hover span:after { ...@@ -155,11 +153,6 @@ button:hover span:after {
right: 0; right: 0;
} }
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
form button { form button {
display: inline-block; display: inline-block;
......
/* Stylesheet for the search simulator pages: typography, navigation bar,
   fixed footer, cards, the jumbotron banner and the animated buttons. */

/* Loads the Quicksand web font (weights 400 and 700) used by the later
   body rule. */
@import url('https://fonts.googleapis.com/css?family=Quicksand:400,700&display=swap');
/* NOTE(review): body is declared twice; the later body rule in this file
   sets font-family again and, coming later, is the one that applies. */
body {
font-family: sans-serif;
}
h2, h3 {
color: #00a2c6
}
/* NOTE(review): footer is declared twice; the later footer rule repeats
   color and sets background-color to #440f5c, which overrides #591a75. */
footer {
color: white;
background-color: #591a75
}
/* NOTE(review): this nav a rule is repeated verbatim further down, where
   color: white is added. */
nav a {
font-size: 18px;
font-weight: 400;
text-decoration: none;
}
nav a:hover {
font-weight: bold;
}
.profile header {
text-align: center;
}
/* Footer pinned to the bottom edge of the viewport across its full width. */
footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
padding: 5px;
color: white;
background-color: #440f5c;
text-align: center;
font-weight: bold;
}
/* Full-width image capped at 300px tall; object-fit: cover crops it around
   the centre instead of distorting it. */
.featured-image {
width: 100%;
max-height: 300px;
object-fit: cover;
object-position: center;
}
/* Rounded panel with a soft drop shadow. */
.card {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
}
/* Banner with a remote background image; the text shadow keeps the white
   text readable on top of the picture. */
.jumbotron {
font-size: 20px;
padding: 60px;
text-align: center;
color: white;
background-image: url(https://ak.picdn.net/assets/cms/music_subscription_homepage_banner.jpg);
background-size: cover;
background-repeat: no-repeat;
text-shadow: black 0.3em 0.3em 0.3em;
}
/* Navigation bar that sticks to the top of the page while scrolling. */
nav {
background-color: #091729;
padding: 5px;
position: sticky;
top: 0;
}
nav a {
font-size: 18px;
font-weight: 400;
text-decoration: none;
color: white;
}
/* Effective body rule: Quicksand with a sans-serif fallback, and no default
   page margin or padding. */
body {
font-family: 'Quicksand', sans-serif;
margin: 0;
padding: 0;
}
main {
padding: 15px;
overflow: auto;
}
#content {
width: 100%;
}
/* Padding and borders count towards every element's declared width. */
* {
box-sizing: border-box;
}
/* Applies to elements carrying class "button"; the button span rules below
   use the element selector instead, so they match <button> elements. */
.button {
display: inline-block;
border-radius: 4px;
background-color: #7c1ca6;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 15px;
padding: 20px;
transition: all 0.5s;
cursor: pointer;
margin: 5px;
}
button span {
cursor: pointer;
display: inline-block;
position: relative;
transition: 0.5s;
}
/* '\00bb' is the » character; it starts out transparent and shifted 20px to
   the right of the label. */
button span:after {
content: '\00bb';
position: absolute;
opacity: 0;
top: 0;
right: -20px;
transition: 0.5s;
}
/* On hover the label makes room on its right ... */
button:hover span {
padding-right: 25px;
}
/* ... and the » marker fades in and moves into that space. */
button:hover span:after {
opacity: 1;
right: 0;
}
/* Buttons inside forms: same look as .button, with smaller padding and a
   fixed 80px width. */
form button {
display: inline-block;
border-radius: 4px;
background-color: #7c1ca6;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 15px;
padding: 10px;
transition: all 0.5s;
cursor: pointer;
margin: 5px;
width: 80px;
}
\ No newline at end of file
...@@ -5,23 +5,6 @@ ...@@ -5,23 +5,6 @@
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title> <title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet"> <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
</head> </head>
<body> <body>
...@@ -38,69 +21,23 @@ ...@@ -38,69 +21,23 @@
</div> </div>
<center><h1>Dataset</h1><br></center> <center><h1>Dataset</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;"> <table>
<tr>
<div id = "leftbox"> <th>DOCNO</th>
<table> <th>ARTIST</th>
<tr> <th>SONG</th>
<th>DOCNO</th> <th>LYRICS</th>
</tr> </tr>
{% for i in DOCNO %} {% for i in DOCNO %}
<tr> <tr>
<td>{{ i }}</td> <td>{{ i }}</td>
</tr> <td>{{ j }}</td>
{% endfor %} <td>{{ k }}</td>
<td>{{ l }}</td>
</table> </tr>
</div> {% endfor %}
</table>
<div id = "middlebox">
<table align="left">
<tr>
<th>SONG</th>
</tr>
{% for i in SONG %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>ARTIST</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middleboxb">
<table>
<tr>
<th>LYRICS</th>
</tr>
{% for i in LYRICS %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article> </article>
</div> </div>
......
...@@ -4,24 +4,7 @@ ...@@ -4,24 +4,7 @@
<head> <head>
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title> <title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet"> <link href="../../static/assets/css/trying.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
</head> </head>
<body> <body>
...@@ -37,41 +20,20 @@ ...@@ -37,41 +20,20 @@
</div> </div>
</div> </div>
<center><h1>Proximity Index</h1><br></center> <center><p style="font-size:40px;"><strong>Indexing</strong></p>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;"> <table width="100%"; border="1px solid black">
<tr>
<div id = "leftbox"> <th>Token</th>
<table> <th>Index</th>
<tr> </tr>
<th>Token</th>
</tr>
{% for i in words %} {% for key, values in res.items %}
<tr> <tr>
<td>{{ i }}</td> <td>{{ key }}</td>
</tr> <td>{{ values }}</td>
{% endfor %} </tr>
{% endfor %}
</table> </table>
</div>
<div id = "middleboxb">
<table align="left">
<tr>
<th>Index</th>
</tr>
{% for i in freq %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article> </article>
</div> </div>
......
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content=""> <meta name="description" content="">
<meta name="author" content=""> <meta name="author" content="">
<title>Inverted Index</title> <title>Inverted Index</title>
<!-- Bootstrap core CSS --> <!-- Bootstrap core CSS -->
<link href="../../static/assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet"> <link href="../../static/assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<!-- Custom fonts for this template --> <!-- Custom fonts for this template -->
<link href="../../static/assets/vendor/fontawesome-free/css/all.min.css" rel="stylesheet"> <link href="../../static/assets/vendor/fontawesome-free/css/all.min.css" rel="stylesheet">
<link href="../../static/assets/vendor/simple-line-icons/css/simple-line-icons.css" rel="stylesheet" type="text/css"> <link href="../../static/assets/vendor/simple-line-icons/css/simple-line-icons.css" rel="stylesheet" type="text/css">
<link href="https://fonts.googleapis.com/css?family=Lato:300,400,700,300italic,400italic,700italic" rel="stylesheet" type="text/css"> <link href="https://fonts.googleapis.com/css?family=Lato:300,400,700,300italic,400italic,700italic" rel="stylesheet" type="text/css">
<!-- Custom styles for this template --> <!-- Custom styles for this template -->
<link href="../../static/assets/css/landing-page.min.css" rel="stylesheet"> <link href="../../static/assets/css/landing-page.min.css" rel="stylesheet">
</head> </head>
<body> <body>
<!-- Navigation --> <nav class="navbar navbar-light bg-light static-top">
<nav class="navbar navbar-light bg-light static-top"> <div class="container">
<div class="container"> <a class="navbar-brand" href="/">Search Simulator</a>
<a class="navbar-brand" href="/">Cari Lagu</a> </div>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a> </nav>
-->
</div>
</nav>
<!-- Masthead --> <section class="testimonials text-center bg-light">
<!-- <header class="masthead text-white text-center"> <div class="container">
<div class="overlay"></div> <h2 class="mb-3">Lirik Lagu</h2>
<div class="container"> <h4 class="mb-3">No. {{ no }} - {{ judul }} </h4>
<div class="row"> <p>{{ lyrics }}</p>
<div class="col-xl-9 mx-auto"> </div>
<h1 class="mb-5">Silahkan masukkan lirik dari lagu yang ingin Anda temukan</h1> </section>
</div>
<div class="col-md-10 col-lg-8 col-xl-7 mx-auto">
<form method="POST" action="/search">
<div class="form-row">
<div class="col-12 col-md-9 mb-2 mb-md-0">
<input type="text" class="form-control form-control-lg" name="querysearch" placeholder="Masukkan Query Anda...">
</div>
<div class="col-12 col-md-3">
<button type="submit" class="btn btn-block btn-lg btn-primary">Cari!</button>
</div>
</div>
</form>
</div>
</div>
</div>
</header> -->
<script src="../../static/assets/vendor/jquery/jquery.min.js"></script>
<script src="../../static/assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<!-- Testimonials --> </body>
<section class="testimonials text-center bg-light">
<div class="container">
<h2 class="mb-3">Lirik Lagu</h2>
<h4 class="mb-3">No.{{no}} - {{judul}} </h4>
<p>{{text}}</p>
</div>
</section>
<!-- Bootstrap core JavaScript -->
<script src="../../static/assets/vendor/jquery/jquery.min.js"></script>
<script src="../../static/assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
</body>
</html> </html>
...@@ -24,41 +24,26 @@ ...@@ -24,41 +24,26 @@
</head> </head>
<body> <body>
<nav class="navbar navbar-light bg-light static-top">
<!-- Navigation --> <div class="container">
<nav class="navbar navbar-light bg-light static-top"> <a class="navbar-brand" href="/">Search Simulator</a>
<div class="container"> </div>
<a class="navbar-brand" href="/">CariLagu</a> </nav>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a>
--> <section class="testimonials text-center bg-light">
</div> <div class="container">
</nav> <h2 class="mb-5">Lagu yang sesuai dengan query "{{ query }}"</h2>
<!-- Testimonials --> <div class="row">
<section class="testimonials text-center bg-light"> {% for key, values in res.items %}
<div class="container"> <div class="col-lg-4">
<h2 class="mb-5">Lagu yang sesuai dengan "{{ query }}"</h2> <div class="testimonial-item mx-auto mb-5 mb-lg-0">
{% if hasil %} <h5><a href="/lyric/{{ key }}">Lagu No: {{ key }}</a></h5>
<div class="row"> <h5>"{{ values }}"</h5>
{% for i in hasil %} </div>
{% for j in i %} </div>
{% endfor %}
<div class="col-lg-4"> </div>
<div class="testimonial-item mx-auto mb-5 mb-lg-0"> </div>
<img class="img-fluid rounded-circle mb-3" src="../../static/img/hkbp.jpg" alt="">
<h5><a href="/lyric">Lagu No:{{ j.docno }}</a></h5>
<h5>"{{ j.judul }}"</h5>
<p class="font-weight-light mb-0">score :{{ j.score }}</p>
</div>
</div>
{% endfor %}
{% endfor %}
</div>
{% else %}
<h2 class="mb-5">Lagu dengan lirik: "{{ query }}" tidak ditemukan</h2>
{% endif %}
</div>
</section> </section>
......
from django.shortcuts import render from django.shortcuts import render
from django.http import HttpResponse from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main from InvertedIndexSimulator.inverted import main
from xml.etree.ElementTree import ElementTree
from sklearn.feature_extraction.text import CountVectorizer
from itertools import count
import pandas as pd import pandas as pd
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
import string import string
import re import re
from sklearn.feature_extraction.text import CountVectorizer import json
import xml.dom.minidom as minidom import xml.dom.minidom as minidom
import collections import collections
from itertools import count
try: try:
from future_builtins import zip from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x except ImportError: # not 2.6+ or is 3.x
...@@ -17,138 +19,56 @@ except ImportError: # not 2.6+ or is 3.x ...@@ -17,138 +19,56 @@ except ImportError: # not 2.6+ or is 3.x
except ImportError: except ImportError:
pass pass
def home(request): def home(request):
return render(request, 'apps/home.html') return render(request, 'apps/home.html')
def dataframe(request): def dataframe(request):
parse_data = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml") parse_data = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
data = parse_data.getroot() context = main.show_dataframe(parse_data)
df_cols = ["DOCNO", "SONG", "ARTIST", "LYRICS"]
rows = []
for node in data:
s_docno = node.find("DOCNO").text if node is not None else None
s_song = node.find("SONG").text if node is not None else None
s_artist = node.find("ARTIST").text if node is not None else None
s_lyrics = node.find("LYRICS").text if node is not None else None
rows.append({"DOCNO": s_docno, "SONG": s_song, "ARTIST": s_artist, "LYRICS": s_lyrics})
DataFrame = pd.DataFrame(rows, columns = df_cols)
dictionary = DataFrame.set_index('DOCNO').T.to_dict('list')
nilai = list(dictionary.values())
nomornya = list(dictionary.keys())
lagunya = [sublist[0] for sublist in nilai]
artisnya = [sublist[1] for sublist in nilai]
liriknya = [sublist[2] for sublist in nilai]
context = {"DOCNO": nomornya, "SONG": lagunya, "ARTIST": artisnya, "LYRICS": liriknya}
return render(request, 'apps/dataframe.html', context) return render(request, 'apps/dataframe.html', context)
def preprocessing(request): def preprocessing(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree() tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml") all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
context = {"tokens_doc": tokens_doc} context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing.html', context) return render(request, 'apps/preprocessing.html', context)
def preprocessing2(request): def preprocessing2(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree() tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml") all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i]) tokens_doc[i] = main.to_lower(tokens_doc[i])
context = {"tokens_doc": tokens_doc} context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing2.html', context) return render(request, 'apps/preprocessing2.html', context)
def preprocessing3(request): def preprocessing3(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree() tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml") all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
...@@ -161,37 +81,18 @@ def preprocessing3(request): ...@@ -161,37 +81,18 @@ def preprocessing3(request):
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)]) tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
context = {"tokens_doc": tokens_doc} context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing3.html', context) return render(request, 'apps/preprocessing3.html', context)
def preprocessing4(request): def preprocessing4(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree() tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml") all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
...@@ -207,33 +108,17 @@ def preprocessing4(request): ...@@ -207,33 +108,17 @@ def preprocessing4(request):
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i]) tokens_doc[i] = main.stemming(tokens_doc[i])
context = {"tokens_doc": tokens_doc} context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing4.html', context) return render(request, 'apps/preprocessing4.html', context)
def indexing(request): def indexing(request):
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml") dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no, N_DOC, all_sentence_doc_sample = main.load_data(dcmnt_xml)
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_profile = dcmnt_xml.getElementsByTagName('SONG')
all_date = dcmnt_xml.getElementsByTagName('ARTIST')
all_text = dcmnt_xml.getElementsByTagName('LYRICS')
all_pub = dcmnt_xml.getElementsByTagName('PUB')
all_page = dcmnt_xml.getElementsByTagName('PAGE')
N_DOC = len(all_doc_no)
all_sentence_doc_sample = []
for i in range(N_DOC):
sentence_doc_sample = ' '+ all_text[i].firstChild.data
all_sentence_doc_sample.append(sentence_doc_sample)
tokens_doc = [] tokens_doc = []
...@@ -249,65 +134,40 @@ def indexing(request): ...@@ -249,65 +134,40 @@ def indexing(request):
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i]) tokens_doc[i] = main.stemming(tokens_doc[i])
all_tokens = [] res = main.indexing(N_DOC, tokens_doc, all_doc_no)
for i in range(N_DOC):
for w in tokens_doc[i]:
all_tokens.append(w)
new_sentence = ' '.join([w for w in all_tokens])
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
all_tokens = set(all_tokens)
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
import json context = {
indexnya = json.loads(json.dumps(proximity_index)) "res": res,
}
words = indexnya.keys()
freq = indexnya.values()
context = {"words": words, "freq": freq}
return render(request, 'apps/indexing.html', context) return render(request, 'apps/indexing.html', context)
def index(request): def index(request):
return render(request, 'apps/index.html') return render(request, 'apps/index.html')
def lyric(request,id):
text, judul = main.detail(id)
content={
'no': id,
'judul':judul,
'text':text
}
return render(request, 'apps/lyric.html', content)
def result(request): def result(request):
#%% dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
# proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
# for key, value in proximity_index.items():
# # print (key, value)
if request.method == 'POST': if request.method == 'POST':
query = request.POST['querysearch'] query = request.POST['querysearch']
hasil= main.main(query) res = main.searching(dcmnt_xml, query)
content={ content = {
'hasil':hasil, 'res':res,
'query':query 'query':query
} }
return render(request, 'apps/result.html', content)
return render(request, 'apps/result.html', content)
def lyric(request,id):
lyrics, judul = main.detail(id)
content = {
'no': id,
'judul':judul,
'lyrics':lyrics,
}
return render(request, 'apps/lyric.html', content)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment