Commit 6c019d71 by Sartika Aritonang

Upload New File

parent 6fdc318a
import re
from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider
from Sastrawi.Stemmer.Filter import TextNormalizer
from Sastrawi.Stemmer.Context.Context import Context
class Stemmer(object):
"""Indonesian Stemmer.
Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS.
@link https://github.com/sastrawi/sastrawi/wiki/Resources
"""
def __init__(self, dictionary):
self.dictionary = dictionary
self.visitor_provider = VisitorProvider()
def get_dictionary(self):
return self.dictionary
def stem(self, text):
"""Stem a text string to its common stem form."""
normalizedText = TextNormalizer.normalize_text(text)
words = normalizedText.split(' ')
stems = []
for word in words:
stems.append(self.stem_word(word))
return ' '.join(stems)
def stem_word(self, word):
"""Stem a word to its common stem form."""
if self.is_plural(word):
return self.stem_plural_word(word)
else:
return self.stem_singular_word(word)
def is_plural(self, word):
#-ku|-mu|-nya
#nikmat-Ku, etc
matches = re.match(r'^(.*)-(ku|mu|nya|lah|kah|tah|pun)$', word)
if matches:
return matches.group(1).find('-') != -1
return word.find('-') != -1
def stem_plural_word(self, plural):
"""Stem a plural word to its common stem form.
Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 76-77.
@link http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
"""
matches = re.match(r'^(.*)-(.*)$', plural)
#translated from PHP conditional check:
#if (!isset($words[1]) || !isset($words[2]))
if not matches:
return plural
words = [matches.group(1), matches.group(2)]
#malaikat-malaikat-nya -> malaikat malaikat-nya
suffix = words[1]
suffixes = ['ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun']
matches = re.match(r'^(.*)-(.*)$', words[0])
if suffix in suffixes and matches:
words[0] = matches.group(1)
words[1] = matches.group(2) + '-' + suffix
#berbalas-balasan -> balas
rootWord1 = self.stem_singular_word(words[0])
rootWord2 = self.stem_singular_word(words[1])
#meniru-nirukan -> tiru
if not self.dictionary.contains(words[1]) and rootWord2 == words[1]:
rootWord2 = self.stem_singular_word('me' + words[1])
if rootWord1 == rootWord2:
return rootWord1
else:
return plural
def stem_singular_word(self, word):
"""Stem a singular word to its common stem form."""
context = Context(word, self.dictionary, self.visitor_provider)
context.execute()
return context.result
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment