#!/usr/bin/env python

# Source: https://machinelearningmastery.com/how-to-develop-machine-learning-models-for-multivariate-multi-step-air-pollution-time-series-forecasting/
# Author: Jason Brownlee - https://machinelearningmastery.com/author/jasonb/
# Data: https://www.kaggle.com/c/dsg-hackathon/data

# prepare data
import os
import sys
from numpy import loadtxt
from numpy import nan
from numpy import isnan
from numpy import count_nonzero
from numpy import unique
from numpy import array
from numpy import nanmedian
from numpy import save

dirname = os.path.dirname(os.path.realpath(__file__))
if len(sys.argv) == 2:
    arg = sys.argv[1]
else:
    arg = None

# split the dataset by 'chunkID', return a list of chunks
def to_chunks(values, chunk_ix=0):
    chunks = []
    # get the unique chunk ids
    chunk_ids = unique(values[:, chunk_ix])
    # group rows by chunk id
    for chunk_id in chunk_ids:
        selection = values[:, chunk_ix] == chunk_id
        chunks.append(values[selection, :])
    return chunks

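# sanity check (toy array, hypothetical values): rows sharing a value in
# column 0 are grouped into one chunk
demo_values = array([[1, 10], [2, 20], [1, 30]])
demo_chunks = to_chunks(demo_values)
assert len(demo_chunks) == 2 and demo_chunks[0].shape == (2, 2)
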
# return a list of relative forecast lead times
def get_lead_times():
    return [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

# interpolate a series of hours (in place) in 24-hour time
def interpolate_hours(hours):
    # find the first known hour
    ix = -1
    for i in range(len(hours)):
        if not isnan(hours[i]):
            ix = i
            break
    # nothing to anchor on if every hour is missing
    if ix == -1:
        return
    # fill-forward
    hour = hours[ix]
    for i in range(ix + 1, len(hours)):
        # increment hour
        hour += 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24
    # fill-backward
    hour = hours[ix]
    for i in range(ix - 1, -1, -1):
        # decrement hour
        hour -= 1
        # check for a fill
        if isnan(hours[i]):
            hours[i] = hour % 24

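# sanity check (toy list): gaps fill forward and backward from the first
# known hour, wrapping modulo 24
demo_hours = [nan, 0.0, nan]
interpolate_hours(demo_hours)
assert demo_hours == [23.0, 0.0, 1.0]
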
# return true if the array has any non-nan values
def has_data(data):
    return count_nonzero(isnan(data)) < len(data)

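# sanity check: an all-nan series carries no data, a partly-observed one does
assert not has_data(array([nan, nan])) and has_data(array([nan, 1.0]))
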
# impute missing data
def impute_missing(train_chunks, rows, hours, series, col_ix):
    # impute missing using the median value for the hour across all series
    imputed = list()
    for i in range(len(series)):
        if isnan(series[i]):
            # collect all rows across all chunks for the hour
            all_rows = list()
            for chunk in train_chunks:
                all_rows.extend(chunk[chunk[:, 2] == hours[i]])
            # calculate the central tendency for the target
            all_rows = array(all_rows)
            # fill with the median value, falling back to zero
            value = nanmedian(all_rows[:, col_ix])
            if isnan(value):
                value = 0.0
            imputed.append(value)
        else:
            imputed.append(series[i])
    return imputed

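# sanity check (toy data, hypothetical values): the missing first value is
# filled with the median of column 3 over all training rows sharing hour 0
demo_train = [array([[1, 1, 0, 4.0], [1, 2, 0, 8.0]])]
assert impute_missing(demo_train, None, [0, 0], [nan, 5.0], 3) == [6.0, 5.0]
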
# lay out a variable as a whole series, with nan breaks for missing positions
def variable_to_series(chunk_train, col_ix, n_steps=5*24):
    # lay out whole series
    data = [nan for _ in range(n_steps)]
    # mark all available data
    for i in range(len(chunk_train)):
        # get position in chunk (column 1 is one-based)
        position = int(chunk_train[i, 1] - 1)
        # store data
        data[position] = chunk_train[i, col_ix]
    return data

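# sanity check (toy rows, hypothetical values): positions 1 and 3 in the
# chunk map to indices 0 and 2 of the laid-out series
demo_rows = array([[1, 1, 0, 9.0], [1, 3, 2, 7.0]])
demo_series = variable_to_series(demo_rows, 3, n_steps=4)
assert demo_series[0] == 9.0 and isnan(demo_series[1]) and demo_series[2] == 7.0
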
# create input/output patterns from a sequence
def supervised_for_lead_time(series, n_lag, lead_time):
    samples = list()
    # enumerate observations and create input/output patterns
    for i in range(n_lag, len(series)):
        end_ix = i + (lead_time - 1)
        # check if a pattern can be created
        if end_ix >= len(series):
            break
        # retrieve input and output
        start_ix = i - n_lag
        row = series[start_ix:i] + [series[end_ix]]
        samples.append(row)
    return samples

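# sanity check (toy series): with n_lag=2 and lead_time=2, each sample is
# two lagged inputs followed by the value two steps ahead
assert supervised_for_lead_time([1, 2, 3, 4, 5, 6], 2, 2) == [[1, 2, 4], [2, 3, 5], [3, 4, 6]]
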
# create supervised learning data for each lead time for this target
def target_to_supervised(chunks, rows, hours, col_ix, n_lag):
    train_lead_times = list()
    # get series
    series = variable_to_series(rows, col_ix)
    if not has_data(series):
        return None, [nan for _ in range(n_lag)]
    # impute
    imputed = impute_missing(chunks, rows, hours, series, col_ix)
    # prepare test sample for chunk-variable
    test_sample = array(imputed[-n_lag:])
    # enumerate lead times
    lead_times = get_lead_times()
    for lead_time in lead_times:
        # make input/output data from series
        train_samples = supervised_for_lead_time(imputed, n_lag, lead_time)
        train_lead_times.append(train_samples)
    return train_lead_times, test_sample

# prepare training [var][lead time][sample] and test [chunk][var][sample]
def data_prep(chunks, n_lag, n_vars=39):
    lead_times = get_lead_times()
    train_data = [[list() for _ in range(len(lead_times))] for _ in range(n_vars)]
    test_data = [[list() for _ in range(n_vars)] for _ in range(len(chunks))]
    # enumerate targets for chunk
    for var in range(n_vars):
        # convert target number into column number
        col_ix = 3 + var
        # enumerate chunks to forecast
        for c_id in range(len(chunks)):
            rows = chunks[c_id]
            # prepare sequence of hours for the chunk
            hours = variable_to_series(rows, 2)
            # interpolate hours
            interpolate_hours(hours)
            # check for no data
            if not has_data(rows[:, col_ix]):
                continue
            # convert series into training data for each lead time
            train, test_sample = target_to_supervised(chunks, rows, hours, col_ix, n_lag)
            # store test sample for this var-chunk
            test_data[c_id][var] = test_sample
            if train is not None:
                # store samples per lead time
                for lead_time in range(len(lead_times)):
                    # add all rows to the existing list of rows
                    train_data[var][lead_time].extend(train[lead_time])
        # convert all rows for each var-lead time to a numpy array
        for lead_time in range(len(lead_times)):
            train_data[var][lead_time] = array(train_data[var][lead_time])
    # sample counts differ per lead time and per chunk, so the outer arrays
    # are ragged; dtype=object keeps modern numpy from rejecting them
    return array(train_data, dtype=object), array(test_data, dtype=object)

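# A hypothetical access pattern (illustration only): after data_prep,
# train_data[var][lead_time] is a 2-D array whose rows hold n_lag lag
# observations followed by the value at that lead time, and
# test_data[chunk][var] holds the last n_lag imputed observations.
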
# load dataset
train = loadtxt(dirname + '/data/dsg/naive_train.csv', delimiter=',')
test = loadtxt(dirname + '/data/dsg/naive_test.csv', delimiter=',')
# group data by chunks
train_chunks = to_chunks(train)
test_chunks = to_chunks(test)
# convert training data into supervised learning data
n_lag = 12
train_data, test_data = data_prep(train_chunks, n_lag)
print(train_data.shape, test_data.shape)
# save train and test sets to file
save(dirname + '/data/dsg/supervised_train.npy', train_data)
save(dirname + '/data/dsg/supervised_test.npy', test_data)