...
@@ -44,7 +44,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
"## Dependencies"
]
},
{
@@ -54,10 +54,37 @@
"outputs": [],
"source": [
"import math\n",
"import numpy\n",
"import pandas\n",
"import pathlib"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"BATCH_SIZE = 5\n",
"\n",
"# Consider including a validation dataset as well.\n",
"DATASET_TYPES = ('training', 'testing')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare Input and Output Data"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -90,13 +117,16 @@
"metadata": {},
"outputs": [],
"source": [
"data = {}\n",
"input_data = {}\n",
"output_data = {}\n",
"\n",
"for key in raw_data_paths.keys():\n",
" # Including a validation dataset is an option too.\n",
" data[key] = {\n",
" 'training': None,\n",
" 'testing': None,\n",
" }"
" input_data[key] = {}\n",
" output_data[key] = {}\n",
" \n",
" for dataset_type in DATASET_TYPES:\n",
" input_data[key][dataset_type] = None\n",
" output_data[key][dataset_type] = None"
]
},
{
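
After this change, the cell builds two parallel nested mappings, keyed first by dataset name and then by split. A minimal standalone sketch of the resulting structure, using a hypothetical `raw_data_paths` entry (the real dictionary is defined earlier in the notebook):

```python
import pprint

DATASET_TYPES = ('training', 'testing')
raw_data_paths = {'airline': 'data/airline.csv'}  # hypothetical placeholder

input_data = {}
output_data = {}

for key in raw_data_paths.keys():
    input_data[key] = {}
    output_data[key] = {}

    for dataset_type in DATASET_TYPES:
        input_data[key][dataset_type] = None
        output_data[key][dataset_type] = None

pprint.pprint(input_data)  # {'airline': {'testing': None, 'training': None}}
```
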
@@ -105,6 +135,16 @@
"metadata": {},
"outputs": [],
"source": [
"def adjust_type(obj):\n",
" if obj.dtype == numpy.float64:\n",
" return obj.astype(numpy.float32)\n",
" \n",
" if obj.dtype == numpy.int64:\n",
" return obj.astype(numpy.int32)\n",
" \n",
" return obj\n",
"\n",
"\n",
"def to_stationary(time_series):\n",
" stationary = time_series.diff()\n",
" return stationary.fillna(0)\n",
@@ -116,8 +156,8 @@
" fp_bound = 0.7 * len(series)\n",
" bound = int(math.floor(fp_bound))\n",
" \n",
" training_data = series.iloc[:bound]\n",
" testing_data = series.iloc[bound:]\n",
" training_data = series.values[:bound]\n",
" testing_data = series.values[bound:]\n",
" \n",
" assert(len(training_data) > 0)\n",
" assert(len(testing_data) > 0)\n",
@@ -125,7 +165,7 @@
" return training_data, testing_data\n",
"\n",
"\n",
"def normalized(training, testing):\n",
"def normalize(training, testing):\n",
" mean = training.mean()\n",
" std = training.std() # TODO: what should ddof be set to?\n",
" \n",
@@ -135,17 +175,50 @@
" return training, testing\n",
"\n",
"\n",
"def create_input_and_output(sequential_series):\n",
" \"\"\"Convert a sequential time series into input and output arrays.\"\"\"\n",
" \n",
" block_size = BATCH_SIZE + 1 # BATCH_SIZE input elements plus one output element\n",
" assert(len(sequential_series) >= block_size)\n",
" \n",
" # \"sequential series size\" - (BATCH_SIZE - 1) - \"one output element\"\n",
" batched_array_size = len(sequential_series) - BATCH_SIZE\n",
" assert(batched_array_size > 0)\n",
" \n",
" shape = (batched_array_size, BATCH_SIZE)\n",
" input_array = numpy.empty(shape, sequential_series.dtype)\n",
" \n",
" for i in range(input_array.shape[0]):\n",
" start = i\n",
" end = start + BATCH_SIZE\n",
" input_array[i] = sequential_series[start:end]\n",
" \n",
" start = BATCH_SIZE\n",
" end = start + input_array.shape[0]\n",
" output_array = numpy.array(sequential_series[start:end])\n",
" \n",
" return input_array, output_array\n",
"\n",
"\n",
"for data_name, raw_data_path in raw_data_paths.items():\n",
" data_frame = pandas.read_csv(raw_data_path, engine='python', skipfooter=2)\n",
" time_series = data_frame[data_frame.columns[1]]\n",
" time_series = adjust_type(time_series)\n",
" \n",
" time_series = to_stationary(time_series)\n",
" training, testing = split_data(time_series)\n",
" training, testing = normalize(training, testing)\n",
" \n",
" training, testing = normalized(training, testing)\n",
" training = adjust_type(training)\n",
" testing = adjust_type(testing)\n",
" \n",
" data[data_name]['training'] = training\n",
" data[data_name]['testing'] = testing"
" input_training, output_training = create_input_and_output(training)\n",
" input_data[data_name]['training'] = input_training\n",
" output_data[data_name]['training'] = output_training\n",
" \n",
" input_testing, output_testing = create_input_and_output(testing)\n",
" input_data[data_name]['testing'] = input_testing\n",
" output_data[data_name]['testing'] = output_testing"
]
},
{
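
For a concrete picture of what `create_input_and_output` does: with `BATCH_SIZE = 5`, a series of length `N` becomes `N - 5` overlapping windows of five consecutive values, each paired with the value that immediately follows it. An equivalent standalone sketch on a toy sequence:

```python
import numpy

BATCH_SIZE = 5
sequence = numpy.arange(8, dtype=numpy.float32)  # toy stand-in for a series

inputs = numpy.stack([sequence[i:i + BATCH_SIZE]
                      for i in range(len(sequence) - BATCH_SIZE)])
targets = sequence[BATCH_SIZE:]

print(inputs)
# [[0. 1. 2. 3. 4.]
#  [1. 2. 3. 4. 5.]
#  [2. 3. 4. 5. 6.]]
print(targets)                      # [5. 6. 7.]
print(inputs.shape, targets.shape)  # (3, 5) (3,)
```

On NumPy 1.20 or newer, `numpy.lib.stride_tricks.sliding_window_view(sequence, BATCH_SIZE)[:-1]` produces the same input windows without the Python loop.
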
@@ -154,10 +227,23 @@
"metadata": {},
"outputs": [],
"source": [
"for value in data.values():\n",
" for s in ('training', 'testing'):\n",
" print(value[s].head())\n",
" print('Rows:', len(value[s]))"
"is_first = True\n",
"for key in raw_data_paths.keys():\n",
" for dataset_type in DATASET_TYPES:\n",
" if is_first:\n",
" is_first = False\n",
" else:\n",
" print()\n",
" \n",
" print('dataset:', key, dataset_type)\n",
" print('input data shape:', input_data[key][dataset_type].shape)\n",
" print('output data shape:', output_data[key][dataset_type].shape)\n",
" \n",
" assert(input_data[key][dataset_type].dtype == output_data[key][dataset_type].dtype)\n",
" print('data type:', input_data[key][dataset_type].dtype)\n",
" \n",
" print(input_data[key][dataset_type][:5])\n",
" print(output_data[key][dataset_type][:5])"
]
}
],
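
As a sanity check on the shapes this cell reports: with the 70/30 split and `BATCH_SIZE = 5`, a raw series of, say, 144 rows (a hypothetical length) leaves 100 training and 44 testing values, so the printed input shapes should be `(95, 5)` and `(39, 5)`:

```python
import math

BATCH_SIZE = 5
series_length = 144  # hypothetical row count, for illustration only

training_length = int(math.floor(0.7 * series_length))  # 100
testing_length = series_length - training_length        # 44

print((training_length - BATCH_SIZE, BATCH_SIZE))  # (95, 5)  training inputs
print((testing_length - BATCH_SIZE, BATCH_SIZE))   # (39, 5)  testing inputs
```
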
...