From a78b43545a45eda5bbf421c8c4f15376cc4ec78b Mon Sep 17 00:00:00 2001 From: Miriam Amin <miriam_amin@web.de> Date: Wed, 25 Nov 2020 14:54:18 +0100 Subject: [PATCH] set up preprocessing notebook with first preprocessing tasks. for #1 --- notebooks/1_Preprocessing.ipynb | 696 ++++++++++++++++++++++++++++++++ 1 file changed, 696 insertions(+) create mode 100644 notebooks/1_Preprocessing.ipynb diff --git a/notebooks/1_Preprocessing.ipynb b/notebooks/1_Preprocessing.ipynb new file mode 100644 index 0000000..7c1cd42 --- /dev/null +++ b/notebooks/1_Preprocessing.ipynb @@ -0,0 +1,696 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "training_file = '../data/train.csv'\n", + "test_file = '../data/test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>keyword</th>\n", + " <th>location</th>\n", + " <th>text</th>\n", + " <th>target</th>\n", + " </tr>\n", + " <tr>\n", + " <th>id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>Our Deeds are the Reason of this #earthquake M...</td>\n", + " <td>1</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>Forest fire near La Ronge Sask. Canada</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>All residents asked to 'shelter in place' are ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>13,000 people receive #wildfires evacuation or...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>Just got sent this photo from Ruby #Alaska as ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>137</th>\n", + " <td>accident</td>\n", + " <td>Charlotte</td>\n", + " <td>9 Mile backup on I-77 South...accident blockin...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>138</th>\n", + " <td>accident</td>\n", + " <td>Baton Rouge, LA</td>\n", + " <td>Has an accident changed your life? 
We will hel...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>139</th>\n", + " <td>accident</td>\n", + " <td>Hagerstown, MD</td>\n", + " <td>#BREAKING: there was a deadly motorcycle car a...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>141</th>\n", + " <td>accident</td>\n", + " <td>Gloucestershire , UK</td>\n", + " <td>@flowri were you marinading it or was it an ac...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>143</th>\n", + " <td>accident</td>\n", + " <td>NaN</td>\n", + " <td>only had a car for not even a week and got in ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>100 rows × 4 columns</p>\n", + "</div>" + ], + "text/plain": [ + " keyword location \\\n", + "id \n", + "1 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + ".. ... ... \n", + "137 accident Charlotte \n", + "138 accident Baton Rouge, LA \n", + "139 accident Hagerstown, MD \n", + "141 accident Gloucestershire , UK \n", + "143 accident NaN \n", + "\n", + " text target \n", + "id \n", + "1 Our Deeds are the Reason of this #earthquake M... 1 \n", + "4 Forest fire near La Ronge Sask. Canada 1 \n", + "5 All residents asked to 'shelter in place' are ... 1 \n", + "6 13,000 people receive #wildfires evacuation or... 1 \n", + "7 Just got sent this photo from Ruby #Alaska as ... 1 \n", + ".. ... ... \n", + "137 9 Mile backup on I-77 South...accident blockin... 1 \n", + "138 Has an accident changed your life? We will hel... 0 \n", + "139 #BREAKING: there was a deadly motorcycle car a... 1 \n", + "141 @flowri were you marinading it or was it an ac... 0 \n", + "143 only had a car for not even a week and got in ... 
1 \n", + "\n", + "[100 rows x 4 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train = pd.read_csv(training_file, index_col=0)\n", + "test = pd.read_csv(test_file, index_col=0)\n", + "train.head(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(7613, 4)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3263, 3)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fehlstellen (NaNs) in Keyword und Location identifizieren und ersetzen" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fehlende Werte in Trainingsdaten: 2533\n", + "Fehlende Werte in Testdaten: 1105\n" + ] + } + ], + "source": [ + "datasets= {'Trainingsdaten':train, 'Testdaten':test}\n", + "for k,v in datasets.items():\n", + " is_NaN = v.isnull()\n", + " row_has_NaN = is_NaN.any(axis=1)\n", + " rows_with_NaN = v[row_has_NaN]\n", + " print('Fehlende Werte in ' + k + ': ' + str(rows_with_NaN.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO\n", + "# Fehldaten in den einzelnen columns (location, keyword) identifizieren\n", + "# durch neue Klasse ersetzen" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Location standardisieren" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO\n", + "# es gibt bestimmt gute 
packages die das können\n", + "# Ziel ist, dass am Ende gleiche Orte, egal wie sie geschrieben sind, der gleichen Klasse zugeordnet werden können" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Umwandlung von Keyword und Location in numerische Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7613\n", + "222\n" + ] + } + ], + "source": [ + "print(len(train['keyword']))\n", + "print(len(train['keyword'].unique()))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>keyword</th>\n", + " <th>location</th>\n", + " <th>text</th>\n", + " <th>target</th>\n", + " </tr>\n", + " <tr>\n", + " <th>id</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>6292</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>Egyptian Militants Tied to ISIS Threaten to Ki...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6293</th>\n", + " <td>hostage</td>\n", + " <td>Vancouver, British Columbia</td>\n", + " <td>‰Û÷Ransomware‰Ûª holds B.C. 
man‰Ûªs computer f...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6294</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>I always tell my mom to bring me food or I wil...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6296</th>\n", + " <td>hostage</td>\n", + " <td>Starling City</td>\n", + " <td>That moth that held me hostage yesterday has b...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6297</th>\n", + " <td>hostage</td>\n", + " <td>The Great State of Maine</td>\n", + " <td>Islamic State group threatens to kill hostage ...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6299</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>quoted here--&gt;CNN: Purported ISIS video thr...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6300</th>\n", + " <td>hostage</td>\n", + " <td>Glenview to Knoxville</td>\n", + " <td>I'm hungry as a hostage</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6301</th>\n", + " <td>hostage</td>\n", + " <td>Indiana</td>\n", + " <td>Who is Tomislav Salopek the Islamic State's Mo...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6302</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>@susanj357 @msnbc @allinwithchris it's like wa...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6303</th>\n", + " <td>hostage</td>\n", + " <td>Roaming around the world</td>\n", + " <td>Islamic State group in Egypt threatens to kill...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6304</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>You will be held hostage by a radical group.</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6305</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>When u get mugged with ur gf u come up with th...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6306</th>\n", + " <td>hostage</td>\n", + " <td>????</td>\n", 
+ " <td>whO'S THAT SHADOW HOLDIN ME HOSTAGE I'VE BEEN ...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6310</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>Related News: 'ISIS video' threatens hostage -...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6311</th>\n", + " <td>hostage</td>\n", + " <td>Cape Neddick, ME</td>\n", + " <td>@EvaHanderek @MarleyKnysh great times until th...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6312</th>\n", + " <td>hostage</td>\n", + " <td>Global</td>\n", + " <td>The horrific story of being a hostage - The ho...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6314</th>\n", + " <td>hostage</td>\n", + " <td>Mariveles, Bataan</td>\n", + " <td>Islamic State group in Egypt threatens to kill...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6315</th>\n", + " <td>hostage</td>\n", + " <td>ÃŒÃT: 40.562796,-75.488849</td>\n", + " <td>Murfreesboro peeps- I'm hearing Walmart on S R...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6317</th>\n", + " <td>hostage</td>\n", + " <td>Roaming around the world</td>\n", + " <td>Islamic State group in Egypt threatens to kill...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6318</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>I sent my emails why are the TRINNA hold me ho...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6320</th>\n", + " <td>hostage</td>\n", + " <td>Melbourne, FL</td>\n", + " <td>Wtf? 
Her biological father is holding her host...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6322</th>\n", + " <td>hostage</td>\n", + " <td>New Chicago</td>\n", + " <td>@mylittlepwnies3 @Early__May @AnathemaZhiv @To...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6323</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>If you fill your mind with encouragement and p...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6325</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>I went to pick up my lunch today and the barte...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6326</th>\n", + " <td>hostage</td>\n", + " <td>San Francisco Bay Area</td>\n", + " <td>@pmarca content is held hostage by network due...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6328</th>\n", + " <td>hostage</td>\n", + " <td>Australia</td>\n", + " <td>New ISIS Video: ISIS Threatens to Behead Croat...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6329</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>Sydney hostage crisis has now been recovered f...</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6330</th>\n", + " <td>hostage</td>\n", + " <td>Victorville, CA</td>\n", + " <td>Wut a lonely lunch. I got ditched. 
And I'm hun...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6331</th>\n", + " <td>hostage</td>\n", + " <td>Washington D.C.</td>\n", + " <td>Nearly 35 years after their release from capti...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6332</th>\n", + " <td>hostage</td>\n", + " <td>NaN</td>\n", + " <td>@gideonstrumpet Have you been held hostage?</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6334</th>\n", + " <td>hostage</td>\n", + " <td>Eugene, Oregon</td>\n", + " <td>Dysfunctional McConnell plans on holding Judic...</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " keyword location \\\n", + "id \n", + "6292 hostage NaN \n", + "6293 hostage Vancouver, British Columbia \n", + "6294 hostage NaN \n", + "6296 hostage Starling City \n", + "6297 hostage The Great State of Maine \n", + "6299 hostage NaN \n", + "6300 hostage Glenview to Knoxville \n", + "6301 hostage Indiana \n", + "6302 hostage NaN \n", + "6303 hostage Roaming around the world \n", + "6304 hostage NaN \n", + "6305 hostage NaN \n", + "6306 hostage ???? \n", + "6310 hostage NaN \n", + "6311 hostage Cape Neddick, ME \n", + "6312 hostage Global \n", + "6314 hostage Mariveles, Bataan \n", + "6315 hostage ÃŒÃT: 40.562796,-75.488849 \n", + "6317 hostage Roaming around the world \n", + "6318 hostage NaN \n", + "6320 hostage Melbourne, FL \n", + "6322 hostage New Chicago \n", + "6323 hostage NaN \n", + "6325 hostage NaN \n", + "6326 hostage San Francisco Bay Area \n", + "6328 hostage Australia \n", + "6329 hostage NaN \n", + "6330 hostage Victorville, CA \n", + "6331 hostage Washington D.C. \n", + "6332 hostage NaN \n", + "6334 hostage Eugene, Oregon \n", + "\n", + " text target \n", + "id \n", + "6292 Egyptian Militants Tied to ISIS Threaten to Ki... 1 \n", + "6293 ‰Û÷Ransomware‰Ûª holds B.C. man‰Ûªs computer f... 0 \n", + "6294 I always tell my mom to bring me food or I wil... 
0 \n", + "6296 That moth that held me hostage yesterday has b... 0 \n", + "6297 Islamic State group threatens to kill hostage ... 1 \n", + "6299 quoted here-->CNN: Purported ISIS video thr... 1 \n", + "6300 I'm hungry as a hostage 0 \n", + "6301 Who is Tomislav Salopek the Islamic State's Mo... 0 \n", + "6302 @susanj357 @msnbc @allinwithchris it's like wa... 0 \n", + "6303 Islamic State group in Egypt threatens to kill... 1 \n", + "6304 You will be held hostage by a radical group. 0 \n", + "6305 When u get mugged with ur gf u come up with th... 0 \n", + "6306 whO'S THAT SHADOW HOLDIN ME HOSTAGE I'VE BEEN ... 0 \n", + "6310 Related News: 'ISIS video' threatens hostage -... 1 \n", + "6311 @EvaHanderek @MarleyKnysh great times until th... 1 \n", + "6312 The horrific story of being a hostage - The ho... 1 \n", + "6314 Islamic State group in Egypt threatens to kill... 1 \n", + "6315 Murfreesboro peeps- I'm hearing Walmart on S R... 1 \n", + "6317 Islamic State group in Egypt threatens to kill... 1 \n", + "6318 I sent my emails why are the TRINNA hold me ho... 0 \n", + "6320 Wtf? Her biological father is holding her host... 0 \n", + "6322 @mylittlepwnies3 @Early__May @AnathemaZhiv @To... 1 \n", + "6323 If you fill your mind with encouragement and p... 0 \n", + "6325 I went to pick up my lunch today and the barte... 1 \n", + "6326 @pmarca content is held hostage by network due... 1 \n", + "6328 New ISIS Video: ISIS Threatens to Behead Croat... 1 \n", + "6329 Sydney hostage crisis has now been recovered f... 1 \n", + "6330 Wut a lonely lunch. I got ditched. And I'm hun... 0 \n", + "6331 Nearly 35 years after their release from capti... 0 \n", + "6332 @gideonstrumpet Have you been held hostage? 0 \n", + "6334 Dysfunctional McConnell plans on holding Judic... 
0 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keyword = 'hostage'\n", + "train[train.keyword ==keyword]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab