#!/usr/bin/env python

# Source: https://machinelearningmastery.com/how-to-develop-machine-learning-models-for-multivariate-multi-step-air-pollution-time-series-forecasting/
# Author: Jason Brownlee - https://machinelearningmastery.com/author/jasonb/
# Data: https://www.kaggle.com/c/dsg-hackathon/data

# prepare data
import os
import sys
from numpy import loadtxt
from numpy import nan
from numpy import isnan
from numpy import count_nonzero
from numpy import unique
from numpy import array
from numpy import nanmedian
from numpy import save

dirname = os.path.dirname(os.path.realpath(__file__))
if len(sys.argv) == 2:
    arg = sys.argv[1]
else:
    arg = None

# split the dataset by 'chunkID', return a list of chunks
def to_chunks(values, chunk_ix=0):
    chunks = []
    # get the unique chunk ids
    chunk_ids = unique(values[:, chunk_ix])
    # group rows by chunk id
    for chunk_id in chunk_ids:
        selection = values[:, chunk_ix] == chunk_id
        chunks.append(values[selection, :])
    return chunks

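# sanity check (toy array, hypothetical values): rows sharing a value in
# column 0 are grouped into one chunk
demo_values = array([[1, 10], [2, 20], [1, 30]])
demo_chunks = to_chunks(demo_values)
assert len(demo_chunks) == 2 and demo_chunks[0].shape == (2, 2)
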
# return a list of relative forecast lead times
def get_lead_times():
    return [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

# interpolate a series of hours (in place) in 24-hour time
def interpolate_hours(hours):
    # find the first known hour
    ix = -1
    for i in range(len(hours)):
        if not isnan(hours[i]):
            ix = i
            break
    # nothing to anchor on if every hour is missing
    if ix == -1:
        return
    # fill-forward
    hour = hours[ix]
    for i in range(ix + 1, len(hours)):
        # increment hour
        hour += 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24
    # fill-backward
    hour = hours[ix]
    for i in range(ix - 1, -1, -1):
        # decrement hour
        hour -= 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24

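# sanity check (toy list): gaps fill forward and backward from the first
# known hour, wrapping modulo 24
demo_hours = [nan, 0.0, nan]
interpolate_hours(demo_hours)
assert demo_hours == [23.0, 0.0, 1.0]
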
# return true if the array has any non-nan values
def has_data(data):
    return count_nonzero(isnan(data)) < len(data)

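# sanity check: an all-nan series carries no data, a partly-observed one does
assert not has_data(array([nan, nan])) and has_data(array([nan, 1.0]))
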
# impute missing data
def impute_missing(train_chunks, rows, hours, series, col_ix):
    # impute missing using the median value for the hour across all series
    imputed = list()
    for i in range(len(series)):
        if isnan(series[i]):
            # collect all rows across all chunks for the hour
            all_rows = list()
            for chunk in train_chunks:
                all_rows.extend(chunk[chunk[:, 2] == hours[i]])
            # calculate the central tendency for the target
            all_rows = array(all_rows)
            # fill with the median value, falling back to zero
            value = nanmedian(all_rows[:, col_ix])
            if isnan(value):
                value = 0.0
            imputed.append(value)
        else:
            imputed.append(series[i])
    return imputed

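# sanity check (toy data, hypothetical values): the missing first value is
# filled with the median of column 3 over all training rows sharing hour 0
demo_train = [array([[1, 1, 0, 4.0], [1, 2, 0, 8.0]])]
assert impute_missing(demo_train, None, [0, 0], [nan, 5.0], 3) == [6.0, 5.0]
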
# lay out a variable as a whole series, with nan breaks for missing positions
def variable_to_series(chunk_train, col_ix, n_steps=5*24):
    # lay out whole series
    data = [nan for _ in range(n_steps)]
    # mark all available data
    for i in range(len(chunk_train)):
        # get position in chunk (column 1 is one-based)
        position = int(chunk_train[i, 1] - 1)
        # store data
        data[position] = chunk_train[i, col_ix]
    return data

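# sanity check (toy rows, hypothetical values): positions 1 and 3 in the
# chunk map to indices 0 and 2 of the laid-out series
demo_rows = array([[1, 1, 0, 9.0], [1, 3, 2, 7.0]])
demo_series = variable_to_series(demo_rows, 3, n_steps=4)
assert demo_series[0] == 9.0 and isnan(demo_series[1]) and demo_series[2] == 7.0
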
# create input/output patterns from a sequence
def supervised_for_lead_time(series, n_lag, lead_time):
    samples = list()
    # enumerate observations and create input/output patterns
    for i in range(n_lag, len(series)):
        end_ix = i + (lead_time - 1)
        # check if a pattern can be created
        if end_ix >= len(series):
            break
        # retrieve input and output
        start_ix = i - n_lag
        row = series[start_ix:i] + [series[end_ix]]
        samples.append(row)
    return samples

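# sanity check (toy series): with n_lag=2 and lead_time=2, each sample is
# two lagged inputs followed by the value two steps ahead
assert supervised_for_lead_time([1, 2, 3, 4, 5, 6], 2, 2) == [[1, 2, 4], [2, 3, 5], [3, 4, 6]]
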
# create supervised learning data for each lead time for this target
def target_to_supervised(chunks, rows, hours, col_ix, n_lag):
    train_lead_times = list()
    # get series
    series = variable_to_series(rows, col_ix)
    if not has_data(series):
        return None, [nan for _ in range(n_lag)]
    # impute
    imputed = impute_missing(chunks, rows, hours, series, col_ix)
    # prepare test sample for chunk-variable
    test_sample = array(imputed[-n_lag:])
    # enumerate lead times
    lead_times = get_lead_times()
    for lead_time in lead_times:
        # make input/output data from series
        train_samples = supervised_for_lead_time(imputed, n_lag, lead_time)
        train_lead_times.append(train_samples)
    return train_lead_times, test_sample

# prepare training [var][lead time][sample] and test [chunk][var][sample]
def data_prep(chunks, n_lag, n_vars=39):
    lead_times = get_lead_times()
    train_data = [[list() for _ in range(len(lead_times))] for _ in range(n_vars)]
    test_data = [[list() for _ in range(n_vars)] for _ in range(len(chunks))]
    # enumerate targets for chunk
    for var in range(n_vars):
        # convert target number into column number
        col_ix = 3 + var
        # enumerate chunks to forecast
        for c_id in range(len(chunks)):
            rows = chunks[c_id]
            # prepare sequence of hours for the chunk
            hours = variable_to_series(rows, 2)
            # interpolate hours
            interpolate_hours(hours)
            # check for no data
            if not has_data(rows[:, col_ix]):
                continue
            # convert series into training data for each lead time
            train, test_sample = target_to_supervised(chunks, rows, hours, col_ix, n_lag)
            # store test sample for this var-chunk
            test_data[c_id][var] = test_sample
            if train is not None:
                # store samples per lead time
                for lead_time in range(len(lead_times)):
                    # add all rows to the existing list of rows
                    train_data[var][lead_time].extend(train[lead_time])
        # convert all rows for each var-lead time to a numpy array
        for lead_time in range(len(lead_times)):
            train_data[var][lead_time] = array(train_data[var][lead_time])
    # sample counts differ per lead time and per chunk, so the outer arrays
    # are ragged; dtype=object keeps modern numpy from rejecting them
    return array(train_data, dtype=object), array(test_data, dtype=object)

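# A hypothetical access pattern (illustration only): after data_prep,
# train_data[var][lead_time] is a 2-D array whose rows hold n_lag lag
# observations followed by the value at that lead time, and
# test_data[chunk][var] holds the last n_lag imputed observations.
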
# load dataset
train = loadtxt(dirname + '/data/dsg/naive_train.csv', delimiter=',')
test = loadtxt(dirname + '/data/dsg/naive_test.csv', delimiter=',')
# group data by chunks
train_chunks = to_chunks(train)
test_chunks = to_chunks(test)
# convert training data into supervised learning data
n_lag = 12
train_data, test_data = data_prep(train_chunks, n_lag)
print(train_data.shape, test_data.shape)
# save train and test sets to file
save(dirname + '/data/dsg/supervised_train.npy', train_data)
save(dirname + '/data/dsg/supervised_test.npy', test_data)