Commit 2d25e3c9 by Yolanda Nainggolan

add indexing

parent e0d68fdf
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
    /* Two side-by-side columns: token list on the left, its posting
       list (proximity index entries) next to it. */
    #leftbox {
        text-align: center;
        float: left;
        white-space: nowrap;
    }
    #middlebox {
        float: left;
        text-align: center;
        white-space: nowrap;
    }
    #middleboxb {
        float: left;
        text-align: left;
        white-space: nowrap;
    }
</style>
</head>
<body>
<!-- NOTE(review): reconstructed from a diff scrape; a hidden hunk
     (@@ -19,25 +36,50 @@) elides the markup between <body> and the
     pagination controls below — confirm against the repository. -->
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Proximity Index</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
    <div id="leftbox">
        <table>
            <tr>
                <th>Token</th>
            </tr>
            {% for i in words %}
            <tr>
                <td>{{ i }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>
    <div id="middleboxb">
        <table align="left">
            <tr>
                <th>Index</th>
            </tr>
            {% for i in freq %}
            <tr>
                <td>{{ i }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>
</article>
</article>
</div>
</main>
<!-- <footer>
<p>&copy; STBI-2020-03</p>
</footer> -->
</body>
...@@ -3,6 +3,19 @@ from django.http import HttpResponse ...@@ -3,6 +3,19 @@ from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main from InvertedIndexSimulator.inverted import main
import pandas as pd import pandas as pd
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
import collections
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
def home(request):
    """Render the simulator's landing page."""
    return render(request, 'apps/home.html')
...@@ -201,37 +214,31 @@ def preprocessing4(request): ...@@ -201,37 +214,31 @@ def preprocessing4(request):
def indexing(request):
    """Build a proximity (positional inverted) index over the song-lyric
    corpus and render it in the indexing template.

    Returns the rendered ``apps/indexing.html`` with:
      * ``words`` — the sorted vocabulary (index terms), and
      * ``freq``  — for each term, a ``{doc_no: [1-based positions]}`` map.

    NOTE(review): reconstructed from a side-by-side diff scrape; hunks
    hidden in the diff are marked with TODO comments below.
    """
    import xml.dom.minidom as minidom

    # Parse the whole corpus; each document contributes DOCNO/LYRICS nodes.
    dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_profile = dcmnt_xml.getElementsByTagName('SONG')
    all_date = dcmnt_xml.getElementsByTagName('ARTIST')
    all_text = dcmnt_xml.getElementsByTagName('LYRICS')
    all_pub = dcmnt_xml.getElementsByTagName('PUB')
    all_page = dcmnt_xml.getElementsByTagName('PAGE')

    N_DOC = len(all_doc_no)

    # One raw "sentence" per document: its lyric text (leading space kept
    # to preserve the original tokenizer behaviour).
    all_sentence_doc_sample = []
    for i in range(N_DOC):
        sentence_doc_sample = ' ' + all_text[i].firstChild.data
        all_sentence_doc_sample.append(sentence_doc_sample)

    # Preprocessing pipeline (project helpers): tokenize, stop-words, stem.
    tokens_doc = []
    for i in range(N_DOC):
        tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc_sample[i]))

    for i in range(N_DOC):
        tokens_doc[i] = main.stop_word_token(tokens_doc[i])

    # TODO(review): hidden diff hunk here (@@ -242,27 +249,18 @@) — confirm
    # whether extra normalisation happens between stop-word removal and
    # stemming in the actual file.
    for i in range(N_DOC):
        tokens_doc[i] = main.stemming(tokens_doc[i])

    # Vocabulary: every distinct token across all documents.
    all_tokens = []
    for i in range(N_DOC):
        for w in tokens_doc[i]:
            all_tokens.append(w)

    new_sentence = ' '.join([w for w in all_tokens])
    for w in CountVectorizer().build_tokenizer()(new_sentence):
        all_tokens.append(w)
    all_tokens = set(all_tokens)

    # token -> {doc_no: [1-based positions of the token in that document]}
    proximity_index = {}
    for token in all_tokens:
        dict_doc_position = {}
        # TODO(review): hidden diff hunk (@@ -271,12 +269,16 @@) — the
        # enclosing per-document loop is not visible in the scrape;
        # `for n in range(N_DOC):` is inferred from the use of `n` below.
        for n in range(N_DOC):
            dict_doc_position[all_doc_no[n].firstChild.data] = [
                i + 1 for i, j in zip(count(), tokens_doc[n]) if j == token
            ]
        proximity_index[token] = dict_doc_position

    proximity_index = collections.OrderedDict(sorted(proximity_index.items()))

    # Round-trip through JSON to hand the template a plain dict.
    import json
    indexnya = json.loads(json.dumps(proximity_index))
    words = indexnya.keys()
    freq = indexnya.values()
    context = {"words": words, "freq": freq}
    return render(request, 'apps/indexing.html', context)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment