Commit f27cbbb1 by Febby Simanjuntak

inverted done

parent 3e9ccf70
@@ -632,64 +632,78 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def tokenize(row):\n",
-    "    if row is None or row is '':\n",
-    "        tokens = \"\"\n",
-    "    else:\n",
-    "        tokens = str(row).split(\" \")[:maxtokens]\n",
-    "    return tokens"
+    "def tokenize(text):\n",
+    "    words = word_tokenize(text)\n",
+    "    return words"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Regular expressions to remove unnecessary characters"
+    "### Normalization"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import re\n",
+    "def to_lowercase(data):\n",
+    "    new_word = []\n",
+    "    for word in data.columns:\n",
+    "        word = word.lower()\n",
+    "        new_word.append(word)\n",
+    "    return new_word\n",
+    "\n",
+    "def remove_stopwords(data):\n",
+    "    for col in data.columns:\n",
+    "        if col in stopwords.words('english'):\n",
+    "            data = data.drop(columns = col)\n",
+    "    return data;\n",
     "\n",
-    "def reg_expressions(row):\n",
-    "    tokens = []\n",
-    "    try:\n",
-    "        for token in row:\n",
-    "            token = token.lower() # make all characters lower case\n",
-    "            token = re.sub(r'[\\W\\d]', \"\", token)\n",
-    "            token = token[:maxtokenlen] # truncate token\n",
-    "            tokens.append(token)\n",
-    "    except:\n",
-    "        token = \"\"\n",
-    "        tokens.append(token)\n",
-    "    return tokens"
+    "def normalize():\n",
+    "    words = to_lowercase(df)\n",
+    "    data = remove_stopwords(df)\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "norm = normalize()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Stop-word removal"
+    "### Inverted Index"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def stop_word_removal(row):\n",
-    "    token = [token for token in row if token not in stopwords]\n",
-    "    token = filter(None, token)\n",
-    "    return token"
+    "\n",
+    "def create_Inverted_index(all_unique_documents):\n",
+    "    inverted_index = {}\n",
+    "    for doc_id in range(len(all_unique_documents)):\n",
+    "        for term in all_unique_documents[doc_id]:\n",
+    "            if term not in inverted_index:\n",
+    "                inverted_index[term] = []\n",
+    "            inverted_index[term].append(doc_id) \n",
+    "    return inverted_index"
    ]
   },
   {
...
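For reviewers: a minimal self-contained sketch of the preprocessing flow the new cells rely on (NLTK word tokenization, lowercasing, stop-word filtering). The sample documents and the normalize_tokens helper below are illustrative assumptions for this note, not code from the commit; the committed normalize() operates on a DataFrame named df instead.

# Sketch only, assuming NLTK is installed; not part of the commit.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)      # tokenizer model used by word_tokenize
nltk.download('stopwords', quiet=True)  # English stop-word list

STOPWORDS = set(stopwords.words('english'))

def tokenize(text):
    # Mirrors the committed tokenize(): delegate to NLTK's word_tokenize.
    return word_tokenize(text)

def normalize_tokens(tokens):
    # Hypothetical helper: lowercase each token, then drop English stop words.
    return [t.lower() for t in tokens if t.lower() not in STOPWORDS]

docs = ["The cat sat on the mat.", "A dog chased the cat."]  # assumed sample data
tokenized_docs = [normalize_tokens(tokenize(d)) for d in docs]
# tokenized_docs == [['cat', 'sat', 'mat', '.'], ['dog', 'chased', 'cat', '.']]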
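Continuing from the sketch above, a usage example for create_Inverted_index as committed, plus a small AND-style lookup added for illustration (the query helper is hypothetical, not in the commit). Note that the committed loop appends doc_id once per term occurrence, so a postings list can contain duplicate ids when a term repeats within a document.

# create_Inverted_index as committed: term -> list of doc ids containing it.
def create_Inverted_index(all_unique_documents):
    inverted_index = {}
    for doc_id in range(len(all_unique_documents)):
        for term in all_unique_documents[doc_id]:
            if term not in inverted_index:
                inverted_index[term] = []
            inverted_index[term].append(doc_id)
    return inverted_index

index = create_Inverted_index(tokenized_docs)
# index['cat'] == [0, 1]; index['dog'] == [1]

def query_and(index, terms):
    # Hypothetical helper: ids of documents containing every query term.
    postings = [set(index.get(t, [])) for t in terms]
    return set.intersection(*postings) if postings else set()

print(query_and(index, ['cat', 'dog']))  # -> {1}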