Commit 033fb2ae authored May 28, 2020 by Sartika Aritonang
Upload proximity.py

parent 97bf6435
Showing 1 changed file with 287 additions and 0 deletions

proximity.py (project/news_site/proximity/proximity.py, new file, 0 → 100644): +287 / -0
import re
import math
import string
import xml.dom.minidom as minidom
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
def parse_xml():
    # Read data/news.xml and return the fields of every news item as parallel lists.
    news_collection = minidom.parse("data/news.xml")
    news_id = news_collection.getElementsByTagName('ID')
    news_source = news_collection.getElementsByTagName('SOURCE')
    news_link = news_collection.getElementsByTagName('LINK')
    news_title = news_collection.getElementsByTagName('TITLE')
    news_author = news_collection.getElementsByTagName('AUTHOR')
    news_datetime = news_collection.getElementsByTagName('DATETIME')
    news_paragraph = news_collection.getElementsByTagName('PARAGRAPH')
    N_news = len(news_id)
    id_in_news = []
    sentence_in_source = []
    sentence_in_link = []
    sentence_in_title = []
    sentence_in_author = []
    sentence_in_datetime = []
    sentence_in_news = []
    for i in range(N_news):
        ids = news_id[i].firstChild.data
        id_in_news.append(ids)
    for i in range(N_news):
        sentences = news_source[i].firstChild.data
        sentence_in_source.append(sentences)
    for i in range(N_news):
        sentences = news_link[i].firstChild.data
        sentence_in_link.append(sentences)
    for i in range(N_news):
        sentences = news_title[i].firstChild.data
        sentence_in_title.append(sentences)
    for i in range(N_news):
        sentences = news_author[i].firstChild.data
        sentence_in_author.append(sentences)
    for i in range(N_news):
        sentences = news_datetime[i].firstChild.data
        sentence_in_datetime.append(sentences)
    for i in range(N_news):
        sentences = news_paragraph[i].firstChild.data
        sentence_in_news.append(sentences)
    return ({'id_in_news': id_in_news,
             'sentence_in_source': sentence_in_source,
             'sentence_in_link': sentence_in_link,
             'sentence_in_title': sentence_in_title,
             'sentence_in_author': sentence_in_author,
             'sentence_in_datetime': sentence_in_datetime,
             'sentence_in_news': sentence_in_news})
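# For reference, parse_xml() expects an XML corpus along these lines; the tag
# names come from the calls above, but the exact nesting of data/news.xml is an
# assumption made here only for illustration (getElementsByTagName searches the
# whole document, so the nesting does not affect the code):
#
#   <NEWS_COLLECTION>
#     <NEWS>
#       <ID>1</ID>
#       <SOURCE>...</SOURCE>
#       <LINK>https://...</LINK>
#       <TITLE>...</TITLE>
#       <AUTHOR>...</AUTHOR>
#       <DATETIME>...</DATETIME>
#       <PARAGRAPH>...</PARAGRAPH>
#     </NEWS>
#   </NEWS_COLLECTION>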
def removePunctuation(textList):
    for i in range(len(textList)):
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
        textList[i] = re.sub(r'“', '', textList[i])
        textList[i] = re.sub(r'”', '', textList[i])
    return textList
def token(sentence):
    token = []
    for word in CountVectorizer().build_tokenizer()(sentence):
        token.append(word)
    return token

def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens
def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text
def get_token(file):
    # Preprocess titles and bodies (punctuation removal, case folding, tokenization)
    # and return one token list per field per news item.
    #file = parse_xml()
    content = removePunctuation(file['sentence_in_news'])
    title = removePunctuation(file['sentence_in_title'])
    contents = caseFolding(content)
    titles = caseFolding(title)
    token_contents = tokenize(contents)
    token_titles = tokenize(titles)
    token = []
    for i in token_titles:
        token.append(i)
    for j in token_contents:
        token.append(j)
    return token
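# Note: get_token returns the title token lists followed by the content token
# lists, so for N news items it yields 2*N entries in corpus order. createIndex
# below relies on this when it doubles the ID list to map list positions back
# to document IDs.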
def checkStopword(sentence, stop_words):
    sentence = [w for w in sentence if not w in stop_words]
    return sentence

def stopwordRemove(textList):
    with open("data/id.stopwords.02.01.2016.txt", "r") as fd:
        stopwords = fd.read().splitlines()
    stop_words = set(stopwords)
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text
def numberRemove(textList):
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text
def stemming(textList):
    # Stem every token in place with the Sastrawi (Indonesian) stemmer.
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text
def getAllTerms(textList):
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))
def createIndex(textList):
    # Build a positional index {term: {doc_id: [positions]}} over textList.
    # parse_xml() is needed here for the document IDs; the ID list is doubled
    # because get_token yields a title entry and a content entry per news item.
    file = parse_xml()
    #token = get_token()
    #tokenize = stopwordRemove(token)
    #tokenize = numberRemove(tokenize)
    #textList = stemming(tokenize)
    terms = getAllTerms(textList)
    proximity = {}
    for term in terms:
        position = {}
        for n in range(len(textList)):
            if (term in textList[n]):
                position[(file['id_in_news'] * 2)[n]] = []
                for i in range(len(textList[n])):
                    if (term == textList[n][i]):
                        position[(file['id_in_news'] * 2)[n]].append(i)
        proximity[term] = position
    return proximity
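# Shape of the index produced by createIndex (terms and IDs below are
# illustrative values only): a dict mapping each term to the documents it
# occurs in, and each document ID to the token positions of the term, e.g.
#
#   {'menteri': {'3': [0, 17], '7': [4]},
#    'jakarta': {'1': [2]}}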
def save_indexing():
    # Preprocess the corpus, build the positional index, and write it to index.txt.
    file = parse_xml()
    tokens = get_token(file)
    tokens = stopwordRemove(tokens)
    tokens = numberRemove(tokens)
    textList = stemming(tokens)
    indexing = createIndex(textList)
    file = open('index.txt', 'w')
    file.write(str(indexing))
    file.close()
# save_indexing()
def open_indexing():
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    index = eval(fi)
    return index
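# A stricter way to load the saved dict, if preferred: ast.literal_eval only
# accepts Python literals, unlike eval. The helper name below is illustrative
# and not used elsewhere in this module.
#
#   import ast
#   def open_indexing_safe():
#       with open("data/index.txt", "r") as fd:
#           return ast.literal_eval(fd.read())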
def removePunctuationQuery(textList):
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for x in textList:
        if x in punctuations:
            textList = textList.replace(x, "")
    return textList
def queryPreprocessing(query):
    terms = []
    query = removePunctuationQuery(query)
    querys = []
    querys.append(query)
    # Lower-case the query so its terms match the case-folded index terms.
    querys = caseFolding(querys)
    for i in range(len(querys)):
        querys[i] = ''.join([ch for ch in querys[i] if not ch.isdigit()])
        querys[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', querys[i], flags=re.MULTILINE)
        terms.append(word_tokenize(querys[i]))
    terms = numberRemove(terms)
    terms = stopwordRemove(terms)
    terms = stemming(terms)
    return terms
def queryInIndex(query, index):
    result = []
    for word in query:
        if word in index:
            result.append(word)
    return result

def df(query, index):
    docFreq = {}
    for word in query:
        if word in index:
            docFreq[word] = len(index[word])
    return docFreq
def idf(df, N):
    inv = {}
    for word in df:
        inv[word] = math.log10(N / df[word])
    return inv

def tf(query, index):
    termFreq = {}
    for word in query:
        freq = {}
        if word in index:
            for i in index[word]:
                freq[i] = len(index[word][i])
        termFreq[word] = freq
    return termFreq
def tfidf(tf, idf):
    w = {}
    for word in tf:
        wtd = {}
        for doc in tf[word]:
            wtd[doc] = (1 + (math.log10(tf[word][doc]))) * idf[word]
        w[word] = wtd
    return w
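# The weighting used above, written out: for a term t and document d,
#   idf(t)  = log10(N / df(t))
#   w(t, d) = (1 + log10(tf(t, d))) * idf(t)
# e.g. with N = 100 documents, df(t) = 10 and tf(t, d) = 3:
#   idf(t) = log10(10) = 1,  w(t, d) = (1 + log10(3)) * 1 ≈ 1.477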
def score(TFIDF):
    res = {}
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = 0
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = res[j] + TFIDF[i][j]
    sorted_dict = sorted(res, key=res.get, reverse=True)
    return ({'sorted_dict': sorted_dict, 'res': res})
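# score() sums the per-term weights for each document and ranks documents by
# that sum. For example (illustrative numbers):
#   TFIDF = {'t1': {'1': 0.5, '2': 0.2}, 't2': {'2': 0.4}}
#   res   = {'1': 0.5, '2': 0.6}
#   sorted_dict = ['2', '1']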
def results(query):
    # Preprocess the raw query string and score every matching document by summed tf-idf.
    querys = []
    querys.append(query)
    file = parse_xml()
    with open("data/index.txt", "r") as fd:
        fi = fd.read()
    index = eval(fi)
    terms = queryPreprocessing(query)
    querys = terms[0]
    querys = queryInIndex(querys, index)
    N = len(file['id_in_news'])
    tfidf_list = []
    docFrequency = df(querys, index)
    invDocFrequency = idf(docFrequency, N)
    termFrequency = tf(querys, index)
    TFIDF = tfidf(termFrequency, invDocFrequency)
    sc = score(TFIDF)
    relevanceDocNumber = []
    count = 0
    result = []
    process = []
    for i in range(len(sc['sorted_dict'])):
        relevanceDocNumber.append(int(sc['sorted_dict'][i]))
        a = file['id_in_news'].index(sc['sorted_dict'][i])
        rank = i + 1
        doc_score = sc['res'][sc['sorted_dict'][i]]
        doc_id = sc['sorted_dict'][i]
        doc_source = file['sentence_in_source'][a][:]
        doc_link = file['sentence_in_link'][a][:]
        doc_title = file['sentence_in_title'][a][:]
        doc_author = file['sentence_in_author'][a][:]
        doc_datetime = file['sentence_in_datetime'][a][:]
        doc_contents = file['sentence_in_news'][a][0:400] + '..........'
        result.append({'doc_score': doc_score,
                       'doc_id': doc_id,
                       'doc_source': doc_source,
                       'doc_link': doc_link,
                       'doc_title': doc_title,
                       'doc_author': doc_author,
                       'doc_datetime': doc_datetime,
                       'doc_contents': doc_contents})
    process.append({'terms': terms,
                    'TFIDF': TFIDF,
                    'docFrequency': docFrequency,
                    'invDocFrequency': invDocFrequency,
                    'termFrequency': termFrequency})
    return ({'result': result, 'process': process})
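A minimal sketch of how the module might be driven end to end, assuming data/news.xml and data/id.stopwords.02.01.2016.txt are in place. Note that save_indexing() writes index.txt while results() reads data/index.txt, so the index file has to be copied or moved between the two steps. The import line and the query string are illustrative, not part of the module.

# Usage sketch (assumes proximity.py is importable from the working directory).
from proximity import save_indexing, results

save_indexing()                       # builds the positional index and writes index.txt
# place the generated index at data/index.txt before querying
hits = results("menteri kesehatan")   # illustrative Indonesian query
for doc in hits['result']:
    print(doc['doc_score'], doc['doc_title'])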