Commit f27cbbb1 by Febby Simanjuntak

inverted done

parent 3e9ccf70
@@ -632,64 +632,78 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def tokenize(row):\n",
-    "    if row is None or row is '':\n",
-    "        tokens = \"\"\n",
-    "    else:\n",
-    "        tokens = str(row).split(\" \")[:maxtokens]\n",
-    "    return tokens"
+    "def tokenize(text):\n",
+    "    words = word_tokenize(text)\n",
+    "    return words"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Regular expressions to remove unnecessary characters"
+    "### Normalization"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import re\n",
+    "def to_lowercase(data):\n",
+    "    new_word = []\n",
+    "    for word in data.columns:\n",
+    "        word = word.lower()\n",
+    "        new_word.append(word)\n",
+    "    return new_word\n",
+    "\n",
+    "def remove_stopwords(data):\n",
+    "    for col in data.columns:\n",
+    "        if col in stopwords.words('english'):\n",
+    "            data = data.drop(columns = col)\n",
+    "    return data;\n",
     "\n",
-    "def reg_expressions(row):\n",
-    "    tokens = []\n",
-    "    try:\n",
-    "        for token in row:\n",
-    "            token = token.lower() # make all characters lower case\n",
-    "            token = re.sub(r'[\\W\\d]', \"\", token)\n",
-    "            token = token[:maxtokenlen] # truncate token\n",
-    "            tokens.append(token)\n",
-    "    except:\n",
-    "        token = \"\"\n",
-    "        tokens.append(token)\n",
-    "    return tokens"
+    "def normalize():\n",
+    "    words = to_lowercase(df)\n",
+    "    data = remove_stopwords(df)\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "norm = normalize()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Stop-word removal"
+    "### Inverted Index"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def stop_word_removal(row):\n",
-    "    token = [token for token in row if token not in stopwords]\n",
-    "    token = filter(None, token)\n",
-    "    return token"
+    "\n",
+    "def create_Inverted_index(all_unique_documents):\n",
+    "    inverted_index = {}\n",
+    "    for doc_id in range(len(all_unique_documents)):\n",
+    "        for term in all_unique_documents[doc_id]:\n",
+    "            if term not in inverted_index:\n",
+    "                inverted_index[term] = []\n",
+    "            inverted_index[term].append(doc_id) \n",
+    "    return inverted_index"
    ]
   },
   {
...
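For reviewers: a minimal self-contained sketch of the preprocessing flow the new cells rely on (NLTK word tokenization, lowercasing, stop-word filtering). The sample documents and the normalize_tokens helper below are illustrative assumptions for this note, not code from the commit; the committed normalize() operates on a DataFrame named df instead.

# Sketch only, assuming NLTK is installed; not part of the commit.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)      # tokenizer model used by word_tokenize
nltk.download('stopwords', quiet=True)  # English stop-word list

STOPWORDS = set(stopwords.words('english'))

def tokenize(text):
    # Mirrors the committed tokenize(): delegate to NLTK's word_tokenize.
    return word_tokenize(text)

def normalize_tokens(tokens):
    # Hypothetical helper: lowercase each token, then drop English stop words.
    return [t.lower() for t in tokens if t.lower() not in STOPWORDS]

docs = ["The cat sat on the mat.", "A dog chased the cat."]  # assumed sample data
tokenized_docs = [normalize_tokens(tokenize(d)) for d in docs]
# tokenized_docs == [['cat', 'sat', 'mat', '.'], ['dog', 'chased', 'cat', '.']]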
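Continuing from the sketch above, a usage example for create_Inverted_index as committed, plus a small AND-style lookup added for illustration (the query helper is hypothetical, not in the commit). Note that the committed loop appends doc_id once per term occurrence, so a postings list can contain duplicate ids when a term repeats within a document.

# create_Inverted_index as committed: term -> list of doc ids containing it.
def create_Inverted_index(all_unique_documents):
    inverted_index = {}
    for doc_id in range(len(all_unique_documents)):
        for term in all_unique_documents[doc_id]:
            if term not in inverted_index:
                inverted_index[term] = []
            inverted_index[term].append(doc_id)
    return inverted_index

index = create_Inverted_index(tokenized_docs)
# index['cat'] == [0, 1]; index['dog'] == [1]

def query_and(index, terms):
    # Hypothetical helper: ids of documents containing every query term.
    postings = [set(index.get(t, [])) for t in terms]
    return set.intersection(*postings) if postings else set()

print(query_and(index, ['cat', 'dog']))  # -> {1}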