Commit 3e9ccf70 by Febby Simanjuntak

stbi update

parent f51158c1
...@@ -12,10 +12,12 @@ ...@@ -12,10 +12,12 @@
"# sys.setdefaultencoding('utf-8')\n", "# sys.setdefaultencoding('utf-8')\n",
"import re\n", "import re\n",
"import csv\n", "import csv\n",
"import nltk\n",
"import string\n", "import string\n",
"import random\n", "import random\n",
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from sklearn import metrics\n", "from sklearn import metrics\n",
"from sklearn.svm import SVC" "from sklearn.svm import SVC"
] ]
...@@ -24,32 +26,670 @@ ...@@ -24,32 +26,670 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
"source": [ {
"#\n", "name": "stdout",
"#\n", "output_type": "stream",
"#\n", "text": [
"file=open ('fradulent_emails.txt','r')" "Banyak data = 5172\n"
] ]
}, },
{ {
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Email No.</th>\n",
" <th>the</th>\n",
" <th>to</th>\n",
" <th>ect</th>\n",
" <th>and</th>\n",
" <th>for</th>\n",
" <th>of</th>\n",
" <th>a</th>\n",
" <th>you</th>\n",
" <th>hou</th>\n",
" <th>...</th>\n",
" <th>connevey</th>\n",
" <th>jay</th>\n",
" <th>valued</th>\n",
" <th>lay</th>\n",
" <th>infrastructure</th>\n",
" <th>military</th>\n",
" <th>allowing</th>\n",
" <th>ff</th>\n",
" <th>dry</th>\n",
" <th>Prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Email 1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Email 2</td>\n",
" <td>8</td>\n",
" <td>13</td>\n",
" <td>24</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>102</td>\n",
" <td>1</td>\n",
" <td>27</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Email 3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Email 4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>22</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Email 5</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>57</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Email 6</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>45</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Email 7</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Email 8</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>21</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Email 9</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>18</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Email 10</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Email 11</td>\n",
" <td>22</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>104</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Email 12</td>\n",
" <td>33</td>\n",
" <td>28</td>\n",
" <td>27</td>\n",
" <td>11</td>\n",
" <td>10</td>\n",
" <td>12</td>\n",
" <td>173</td>\n",
" <td>6</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Email 13</td>\n",
" <td>27</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>106</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Email 14</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Email 15</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Email 16</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>36</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Email 17</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Email 18</td>\n",
" <td>36</td>\n",
" <td>21</td>\n",
" <td>6</td>\n",
" <td>14</td>\n",
" <td>7</td>\n",
" <td>17</td>\n",
" <td>194</td>\n",
" <td>25</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Email 19</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Email 20</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 3002 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
"<_io.TextIOWrapper name='fradulent_emails.txt' mode='r' encoding='cp1252'>" " Email No. the to ect and for of a you hou ... connevey jay \\\n",
"0 Email 1 0 0 1 0 0 0 2 0 0 ... 0 0 \n",
"1 Email 2 8 13 24 6 6 2 102 1 27 ... 0 0 \n",
"2 Email 3 0 0 1 0 0 0 8 0 0 ... 0 0 \n",
"3 Email 4 0 5 22 0 5 1 51 2 10 ... 0 0 \n",
"4 Email 5 7 6 17 1 5 2 57 0 9 ... 0 0 \n",
"5 Email 6 4 5 1 4 2 3 45 1 0 ... 0 0 \n",
"6 Email 7 5 3 1 3 2 1 37 0 0 ... 0 0 \n",
"7 Email 8 0 2 2 3 1 2 21 6 0 ... 0 0 \n",
"8 Email 9 2 2 3 0 0 1 18 0 0 ... 0 0 \n",
"9 Email 10 4 4 35 0 1 0 49 1 16 ... 0 0 \n",
"10 Email 11 22 14 2 9 2 2 104 0 2 ... 0 0 \n",
"11 Email 12 33 28 27 11 10 12 173 6 12 ... 0 0 \n",
"12 Email 13 27 17 3 7 5 8 106 3 0 ... 0 0 \n",
"13 Email 14 4 5 7 1 5 1 37 1 3 ... 0 0 \n",
"14 Email 15 2 4 6 0 3 1 16 0 3 ... 0 0 \n",
"15 Email 16 6 2 1 0 2 0 36 3 1 ... 0 0 \n",
"16 Email 17 3 1 2 2 0 1 17 0 0 ... 0 0 \n",
"17 Email 18 36 21 6 14 7 17 194 25 5 ... 0 0 \n",
"18 Email 19 1 3 1 0 2 0 14 0 0 ... 0 0 \n",
"19 Email 20 3 4 11 0 4 2 32 1 5 ... 0 0 \n",
"\n",
" valued lay infrastructure military allowing ff dry Prediction \n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 1 0 0 \n",
"5 0 0 0 0 0 0 0 1 \n",
"6 0 0 0 0 0 0 0 0 \n",
"7 0 0 0 0 0 1 0 1 \n",
"8 0 0 0 0 0 0 0 0 \n",
"9 0 0 0 0 0 0 0 0 \n",
"10 0 0 0 0 0 1 0 0 \n",
"11 0 0 0 0 0 5 0 0 \n",
"12 0 0 0 0 0 4 0 0 \n",
"13 0 0 0 0 0 2 0 0 \n",
"14 0 0 0 0 0 1 0 0 \n",
"15 0 0 0 0 0 0 0 0 \n",
"16 0 0 0 0 0 1 0 1 \n",
"17 0 0 0 0 0 3 0 1 \n",
"18 0 0 0 0 0 0 0 0 \n",
"19 0 0 0 0 0 1 0 0 \n",
"\n",
"[20 rows x 3002 columns]"
] ]
}, },
"execution_count": 3, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"file" "df = pd.read_csv('emails.csv', engine='python')\n",
"print ('Banyak data =', len(df))\n",
"df.head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenization"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(row):\n",
"    \"\"\"Split one raw text value into at most ``maxtokens`` space-separated tokens.\n",
"\n",
"    Args:\n",
"        row: Value to tokenize. Anything other than ``None`` or an empty\n",
"            string is converted with ``str()`` before splitting.\n",
"\n",
"    Returns:\n",
"        A list with the first ``maxtokens`` pieces of ``str(row)`` split on a\n",
"        single space, or the empty string ``\"\"`` when ``row`` is ``None`` or\n",
"        an empty string.\n",
"    \"\"\"\n",
"    # NOTE(review): `maxtokens` is a notebook-level global that is not defined in\n",
"    # this cell; confirm an earlier cell sets it, otherwise this raises NameError.\n",
"    # Compare by value: the previous `row is ''` relied on string interning and\n",
"    # triggers a SyntaxWarning on Python 3.8+. The isinstance guard keeps\n",
"    # array-like inputs from being compared element-wise.\n",
"    if row is None or (isinstance(row, str) and row == ''):\n",
"        return \"\"\n",
"    return str(row).split(\" \")[:maxtokens]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regular expressions to remove unnecessary characters"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def reg_expressions(row):\n",
"    \"\"\"Lower-case each token, strip non-word characters and digits, then truncate it.\n",
"\n",
"    Args:\n",
"        row: Iterable of string tokens.\n",
"\n",
"    Returns:\n",
"        A list of cleaned tokens. If ``row`` cannot be iterated or a token is\n",
"        not a string, the tokens cleaned so far are kept and a single empty\n",
"        string is appended as a placeholder.\n",
"    \"\"\"\n",
"    # NOTE(review): `maxtokenlen` is a notebook-level global that is not defined in\n",
"    # this cell; confirm an earlier cell sets it.\n",
"    tokens = []\n",
"    try:\n",
"        for token in row:\n",
"            token = token.lower()  # make all characters lower case\n",
"            token = re.sub(r'[\\W\\d]', \"\", token)  # drop non-word characters and digits\n",
"            token = token[:maxtokenlen]  # truncate token\n",
"            tokens.append(token)\n",
"    except (TypeError, AttributeError):\n",
"        # Only malformed input is tolerated here. The previous bare `except:` also\n",
"        # hid KeyboardInterrupt and a NameError from a missing `maxtokenlen`, which\n",
"        # silently turned every row into [\"\"].\n",
"        tokens.append(\"\")\n",
"    return tokens"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stop-word removal"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def stop_word_removal(row):\n",
"    \"\"\"Remove stop words and empty tokens from a sequence of tokens.\n",
"\n",
"    Args:\n",
"        row: Iterable of tokens.\n",
"\n",
"    Returns:\n",
"        A list of the tokens that are truthy and not contained in ``stopwords``,\n",
"        in their original order.\n",
"    \"\"\"\n",
"    # NOTE(review): `stopwords` must be a collection of words (list/set) at call\n",
"    # time. The notebook imports the name from nltk.corpus, which is a corpus\n",
"    # reader and does not support `in`; verify another cell rebinds it, e.g. to\n",
"    # the result of stopwords.words(...).\n",
"    # Return a list rather than `filter(None, ...)`: on Python 3 that is a\n",
"    # one-shot iterator with no len() or indexing, and it is empty on a second pass.\n",
"    return [token for token in row if token and token not in stopwords]"
] ]
}, },
{ {
......
...@@ -24,31 +24,603 @@ ...@@ -24,31 +24,603 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
"source": [ {
"file=open ('fradulent_emails.txt')" "name": "stdout",
"output_type": "stream",
"text": [
"Banyak data = 5172\n"
] ]
}, },
{ {
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": { "data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Email No.</th>\n",
" <th>the</th>\n",
" <th>to</th>\n",
" <th>ect</th>\n",
" <th>and</th>\n",
" <th>for</th>\n",
" <th>of</th>\n",
" <th>a</th>\n",
" <th>you</th>\n",
" <th>hou</th>\n",
" <th>...</th>\n",
" <th>connevey</th>\n",
" <th>jay</th>\n",
" <th>valued</th>\n",
" <th>lay</th>\n",
" <th>infrastructure</th>\n",
" <th>military</th>\n",
" <th>allowing</th>\n",
" <th>ff</th>\n",
" <th>dry</th>\n",
" <th>Prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Email 1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Email 2</td>\n",
" <td>8</td>\n",
" <td>13</td>\n",
" <td>24</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>102</td>\n",
" <td>1</td>\n",
" <td>27</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Email 3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Email 4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>22</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Email 5</td>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>57</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Email 6</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>45</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Email 7</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Email 8</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>21</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Email 9</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>18</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Email 10</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Email 11</td>\n",
" <td>22</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>104</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Email 12</td>\n",
" <td>33</td>\n",
" <td>28</td>\n",
" <td>27</td>\n",
" <td>11</td>\n",
" <td>10</td>\n",
" <td>12</td>\n",
" <td>173</td>\n",
" <td>6</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Email 13</td>\n",
" <td>27</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>106</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Email 14</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Email 15</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Email 16</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>36</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Email 17</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Email 18</td>\n",
" <td>36</td>\n",
" <td>21</td>\n",
" <td>6</td>\n",
" <td>14</td>\n",
" <td>7</td>\n",
" <td>17</td>\n",
" <td>194</td>\n",
" <td>25</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Email 19</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Email 20</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 3002 columns</p>\n",
"</div>"
],
"text/plain": [ "text/plain": [
"<_io.TextIOWrapper name='fradulent_emails.txt' mode='r' encoding='cp1252'>" " Email No. the to ect and for of a you hou ... connevey jay \\\n",
"0 Email 1 0 0 1 0 0 0 2 0 0 ... 0 0 \n",
"1 Email 2 8 13 24 6 6 2 102 1 27 ... 0 0 \n",
"2 Email 3 0 0 1 0 0 0 8 0 0 ... 0 0 \n",
"3 Email 4 0 5 22 0 5 1 51 2 10 ... 0 0 \n",
"4 Email 5 7 6 17 1 5 2 57 0 9 ... 0 0 \n",
"5 Email 6 4 5 1 4 2 3 45 1 0 ... 0 0 \n",
"6 Email 7 5 3 1 3 2 1 37 0 0 ... 0 0 \n",
"7 Email 8 0 2 2 3 1 2 21 6 0 ... 0 0 \n",
"8 Email 9 2 2 3 0 0 1 18 0 0 ... 0 0 \n",
"9 Email 10 4 4 35 0 1 0 49 1 16 ... 0 0 \n",
"10 Email 11 22 14 2 9 2 2 104 0 2 ... 0 0 \n",
"11 Email 12 33 28 27 11 10 12 173 6 12 ... 0 0 \n",
"12 Email 13 27 17 3 7 5 8 106 3 0 ... 0 0 \n",
"13 Email 14 4 5 7 1 5 1 37 1 3 ... 0 0 \n",
"14 Email 15 2 4 6 0 3 1 16 0 3 ... 0 0 \n",
"15 Email 16 6 2 1 0 2 0 36 3 1 ... 0 0 \n",
"16 Email 17 3 1 2 2 0 1 17 0 0 ... 0 0 \n",
"17 Email 18 36 21 6 14 7 17 194 25 5 ... 0 0 \n",
"18 Email 19 1 3 1 0 2 0 14 0 0 ... 0 0 \n",
"19 Email 20 3 4 11 0 4 2 32 1 5 ... 0 0 \n",
"\n",
" valued lay infrastructure military allowing ff dry Prediction \n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 1 0 0 \n",
"5 0 0 0 0 0 0 0 1 \n",
"6 0 0 0 0 0 0 0 0 \n",
"7 0 0 0 0 0 1 0 1 \n",
"8 0 0 0 0 0 0 0 0 \n",
"9 0 0 0 0 0 0 0 0 \n",
"10 0 0 0 0 0 1 0 0 \n",
"11 0 0 0 0 0 5 0 0 \n",
"12 0 0 0 0 0 4 0 0 \n",
"13 0 0 0 0 0 2 0 0 \n",
"14 0 0 0 0 0 1 0 0 \n",
"15 0 0 0 0 0 0 0 0 \n",
"16 0 0 0 0 0 1 0 1 \n",
"17 0 0 0 0 0 3 0 1 \n",
"18 0 0 0 0 0 0 0 0 \n",
"19 0 0 0 0 0 1 0 0 \n",
"\n",
"[20 rows x 3002 columns]"
] ]
}, },
"execution_count": 5, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"file" "df = pd.read_csv('emails.csv', engine='python')\n",
"print ('Banyak data =', len(df))\n",
"df.head(20)"
] ]
}, },
{ {
...@@ -60,7 +632,7 @@ ...@@ -60,7 +632,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -81,7 +653,7 @@ ...@@ -81,7 +653,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -110,7 +682,7 @@ ...@@ -110,7 +682,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment