
Commit c0bb36b

Adding multivariate ts forecasting
1 parent 94593a0 commit c0bb36b

File tree: 7 files changed (+673, -1 lines)

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
#!/usr/bin/env python

# Source: https://machinelearningmastery.com/how-to-develop-machine-learning-models-for-multivariate-multi-step-air-pollution-time-series-forecasting/
# Author: Jason Brownlee - https://machinelearningmastery.com/author/jasonb/
# Data: https://www.kaggle.com/c/dsg-hackathon/data

import os
import pandas as pd
import numpy as np
import sys

dirname = os.path.dirname(os.path.realpath(__file__))
if len(sys.argv) == 2:
    arg = sys.argv[1]
else:
    arg = None

# load dataset
def load_data():
    return pd.read_csv(dirname + '/data/dsg/TrainingData.csv')

# split the dataset by 'chunkID', return a dict of id to rows
def to_chunks(values, chunk_ix=1):
    chunks = {}
    # get the unique chunk ids
    chunk_ids = np.unique(values[:, chunk_ix])
    # group rows by chunk id
    for chunk_id in chunk_ids:
        selection = values[:, chunk_ix] == chunk_id
        chunks[chunk_id] = values[selection, :]
    return chunks

# return a list of relative forecast lead times
def get_lead_times():
    return [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

# split each chunk into train/test sets
def split_train_test(chunks, row_in_chunk_ix=2):
    train, test = [], []
    # first 5 days of hourly observations for train
    cut_point = 5 * 24
    # enumerate chunks
    for k, rows in chunks.items():
        # split chunk rows by 'position_within_chunk'
        train_rows = rows[rows[:, row_in_chunk_ix] <= cut_point, :]
        test_rows = rows[rows[:, row_in_chunk_ix] > cut_point, :]
        if len(train_rows) == 0 or len(test_rows) == 0:
            print('>dropping chunk=%d: train=%s, test=%s' % (k, train_rows.shape, test_rows.shape))
            continue
        # store with chunk id, position in chunk, hour and all targets
        indices = [1, 2, 5] + [x for x in range(56, train_rows.shape[1])]
        train.append(train_rows[:, indices])
        test.append(test_rows[:, indices])
    return train, test

# convert the rows in a test chunk to forecasts
def to_forecasts(test_chunks, row_in_chunk_ix=1):
    # get lead times
    lead_times = get_lead_times()
    # first 5 days of hourly observations for train
    cut_point = 5 * 24
    forecasts = []
    # enumerate each chunk
    for rows in test_chunks:
        chunk_id = rows[0, 0]
        # enumerate each lead time
        for tau in lead_times:
            # determine the row in chunk we want for the lead time
            offset = cut_point + tau
            # retrieve data for the lead time using row number in chunk
            row_for_tau = rows[rows[:, row_in_chunk_ix] == offset, :]
            # check if we have data
            if len(row_for_tau) == 0:
                # create a mock row [chunk, position, hour] + [nan...]
                row = [chunk_id, offset, np.nan] + [np.nan for _ in range(39)]
                forecasts.append(row)
            else:
                # store the forecast row
                forecasts.append(row_for_tau[0])
    return np.array(forecasts)

def save_data_split(train_rows, test_rows):
    if arg == "test" or arg == "t":
        np.savetxt(dirname + '/data/dsg/naive_train.csv', train_rows, delimiter=',')
        np.savetxt(dirname + '/data/dsg/naive_test.csv', test_rows, delimiter=',')

df = load_data()
# group data by chunks
values = df.values
chunks = to_chunks(values)
print('Total Chunks: %d' % len(chunks))
# split into train/test
train, test = split_train_test(chunks)
# flatten training chunks to rows
train_rows = np.array([row for rows in train for row in rows])
# print(train_rows.shape)
print('Train Rows: %s' % str(train_rows.shape))
# reduce test to forecast lead times only
test_rows = to_forecasts(test)
print('Test Rows: %s' % str(test_rows.shape))
save_data_split(train_rows, test_rows)
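As a quick sanity check of the chunk grouping above, here is a minimal sketch of to_chunks on made-up rows. The three-row array and its values are illustrative only; the real competition data has many more columns, with 'chunkID' in column 1 as assumed by the default chunk_ix.

    # minimal sketch: group made-up rows by the chunk id in column 1
    toy = np.array([[0, 1, 1, 10.0],
                    [1, 1, 2, 11.0],
                    [2, 2, 1, 12.0]])
    toy_chunks = to_chunks(toy, chunk_ix=1)
    print(len(toy_chunks))          # 2 chunks: ids 1 and 2
    print(toy_chunks[1.0].shape)    # (2, 4) - two rows share chunk id 1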
Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
#!/usr/bin/env python

# Source: https://machinelearningmastery.com/how-to-develop-machine-learning-models-for-multivariate-multi-step-air-pollution-time-series-forecasting/
# Author: Jason Brownlee - https://machinelearningmastery.com/author/jasonb/
# Data: https://www.kaggle.com/c/dsg-hackathon/data

# prepare data
import os
import sys
from numpy import loadtxt
from numpy import nan
from numpy import isnan
from numpy import count_nonzero
from numpy import unique
from numpy import array
from numpy import nanmedian
from numpy import save

dirname = os.path.dirname(os.path.realpath(__file__))
if len(sys.argv) == 2:
    arg = sys.argv[1]
else:
    arg = None

# split the dataset by 'chunkID', return a list of chunks
def to_chunks(values, chunk_ix=0):
    chunks = []
    # get the unique chunk ids
    chunk_ids = unique(values[:, chunk_ix])
    # group rows by chunk id
    for chunk_id in chunk_ids:
        selection = values[:, chunk_ix] == chunk_id
        chunks.append(values[selection, :])
    return chunks

# return a list of relative forecast lead times
def get_lead_times():
    return [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

# interpolate series of hours (in place) in 24 hour time
def interpolate_hours(hours):
    # find the first hour
    ix = -1
    for i in range(len(hours)):
        if not isnan(hours[i]):
            ix = i
            break
    # fill-forward
    hour = hours[ix]
    for i in range(ix + 1, len(hours)):
        # increment hour
        hour += 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24
    # fill-backward
    hour = hours[ix]
    for i in range(ix - 1, -1, -1):
        # decrement hour
        hour -= 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24

# return true if the array has any non-nan values
def has_data(data):
    return count_nonzero(isnan(data)) < len(data)

# impute missing data
def impute_missing(train_chunks, rows, hours, series, col_ix):
    # impute missing using the median value for hour in all series
    imputed = list()
    for i in range(len(series)):
        if isnan(series[i]):
            # collect all rows across all chunks for the hour
            all_rows = list()
            for rows in train_chunks:
                [all_rows.append(row) for row in rows[rows[:, 2] == hours[i]]]
            # calculate the central tendency for target
            all_rows = array(all_rows)
            # fill with median value
            value = nanmedian(all_rows[:, col_ix])
            if isnan(value):
                value = 0.0
            imputed.append(value)
        else:
            imputed.append(series[i])
    return imputed

# lay out a variable with breaks in the data for missing positions
def variable_to_series(chunk_train, col_ix, n_steps=5*24):
    # lay out whole series
    data = [nan for _ in range(n_steps)]
    # mark all available data
    for i in range(len(chunk_train)):
        # get position in chunk
        position = int(chunk_train[i, 1] - 1)
        # store data
        data[position] = chunk_train[i, col_ix]
    return data

# create input/output patterns from a sequence
def supervised_for_lead_time(series, n_lag, lead_time):
    samples = list()
    # enumerate observations and create input/output patterns
    for i in range(n_lag, len(series)):
        end_ix = i + (lead_time - 1)
        # check if we can create a pattern
        if end_ix >= len(series):
            break
        # retrieve input and output
        start_ix = i - n_lag
        row = series[start_ix:i] + [series[end_ix]]
        samples.append(row)
    return samples

# create supervised learning data for each lead time for this target
def target_to_supervised(chunks, rows, hours, col_ix, n_lag):
    train_lead_times = list()
    # get series
    series = variable_to_series(rows, col_ix)
    if not has_data(series):
        return None, [nan for _ in range(n_lag)]
    # impute
    imputed = impute_missing(chunks, rows, hours, series, col_ix)
    # prepare test sample for chunk-variable
    test_sample = array(imputed[-n_lag:])
    # enumerate lead times
    lead_times = get_lead_times()
    for lead_time in lead_times:
        # make input/output data from series
        train_samples = supervised_for_lead_time(imputed, n_lag, lead_time)
        train_lead_times.append(train_samples)
    return train_lead_times, test_sample

# prepare training [var][lead time][sample] and test [chunk][var][sample]
def data_prep(chunks, n_lag, n_vars=39):
    lead_times = get_lead_times()
    train_data = [[list() for _ in range(len(lead_times))] for _ in range(n_vars)]
    test_data = [[list() for _ in range(n_vars)] for _ in range(len(chunks))]
    # enumerate targets for chunk
    for var in range(n_vars):
        # convert target number into column number
        col_ix = 3 + var
        # enumerate chunks to forecast
        for c_id in range(len(chunks)):
            rows = chunks[c_id]
            # prepare sequence of hours for the chunk
            hours = variable_to_series(rows, 2)
            # interpolate hours
            interpolate_hours(hours)
            # check for no data
            if not has_data(rows[:, col_ix]):
                continue
            # convert series into training data for each lead time
            train, test_sample = target_to_supervised(chunks, rows, hours, col_ix, n_lag)
            # store test sample for this var-chunk
            test_data[c_id][var] = test_sample
            if train is not None:
                # store samples per lead time
                for lead_time in range(len(lead_times)):
                    # add all rows to the existing list of rows
                    train_data[var][lead_time].extend(train[lead_time])
        # convert all rows for each var-lead time to a numpy array
        for lead_time in range(len(lead_times)):
            train_data[var][lead_time] = array(train_data[var][lead_time])
    return array(train_data), array(test_data)

# load dataset
train = loadtxt(dirname + '/data/dsg/naive_train.csv', delimiter=',')
test = loadtxt(dirname + '/data/dsg/naive_test.csv', delimiter=',')
# group data by chunks
train_chunks = to_chunks(train)
test_chunks = to_chunks(test)
# convert training data into supervised learning data
n_lag = 12
train_data, test_data = data_prep(train_chunks, n_lag)
print(train_data.shape, test_data.shape)
# save train and test sets to file
save(dirname + '/data/dsg/supervised_train.npy', train_data)
save(dirname + '/data/dsg/supervised_test.npy', test_data)
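To make the lag/lead framing concrete, here is a minimal sketch of supervised_for_lead_time on a made-up six-value series (the numbers are illustrative only, not taken from the dataset): with n_lag=2 and lead_time=2, each sample is the two prior observations plus the value two steps ahead of the last input observation.

    # minimal sketch of the input/output framing
    toy_series = [0, 1, 2, 3, 4, 5]
    print(supervised_for_lead_time(toy_series, n_lag=2, lead_time=2))
    # [[0, 1, 3], [1, 2, 4], [2, 3, 5]]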
