"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n"
"training_file = '../data/train.csv'\n",
"test_file = '../data/test.csv'"
+   "source": [
"# Load both CSV files, using their first column as the row index,\n",
"# then preview the first 100 training rows.\n",
"train, test = (\n",
"    pd.read_csv(csv_path, index_col=0)\n",
"    for csv_path in (training_file, test_file)\n",
")\n",
"train.head(100)"
+   "source": [
"train.shape"
"test.shape"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
"### Fehlstellen (NaNs) in Keyword und Location identifizieren und ersetzen"
+   "source": [
"# For each dataset, report how many rows contain at least one missing value.\n",
"datasets = {'Trainingsdaten': train, 'Testdaten': test}\n",
"for label, frame in datasets.items():\n",
"    # Boolean row mask: True where any column of the row is null.\n",
"    rows_with_NaN = frame[frame.isnull().any(axis=1)]\n",
"    print(f'Fehlende Werte in {label}: {len(rows_with_NaN)}')"
"# TODO\n",
"# Fehlende Werte in den einzelnen Spalten (location, keyword) identifizieren\n",
"# und durch eine eigene Klasse ersetzen"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
"### Location standardisieren"
"# TODO\n",
"# es gibt bestimmt gute Packages, die das können\n",
"# Ziel ist, dass am Ende gleiche Orte, egal wie sie geschrieben sind, der gleichen Klasse zugeordnet werden können"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
"### Umwandlung von Keyword und Location in numerische Features"
"# TODO "
"# Number of keyword entries versus number of distinct keyword values.\n",
"# dropna=False counts NaN as one distinct value, matching unique().\n",
"print(train['keyword'].size)\n",
"print(train['keyword'].nunique(dropna=False))"
"# Show every training row tagged with one example keyword.\n",
"keyword = 'hostage'\n",
"train.loc[train['keyword'] == keyword]"
