Methods
The machine learning algorithm Decision Tree is used to derive rules of transitions between successive weather patterns and to generate possible sequences.
Data preparation¶
Date | Year | Month | GWL(t) | GWL(t+1) |
---|---|---|---|---|
1961-01-01 | 1961 | Jan | WW | WW |
1961-01-02 | 1961 | Jan | WW | WW |
1961-01-03 | 1961 | Jan | WW | WZ |
1961-01-04 | 1961 | Jan | WZ | WZ |
1961-01-05 | 1961 | Jan | WZ | WZ |
1961-01-06 | 1961 | Jan | WZ | WZ |
1961-01-07 | 1961 | Jan | WZ | WZ |
1961-01-08 | 1961 | Jan | WZ | WZ |
1961-01-09 | 1961 | Jan | WZ | WZ |
1961-01-10 | 1961 | Jan | WZ | BM |
… | … | … | … | … |
1990-12-22 | 1990 | Dec | WA | WA |
1990-12-23 | 1990 | Dec | WA | WZ |
1990-12-24 | 1990 | Dec | WZ | WZ |
1990-12-25 | 1990 | Dec | WZ | WZ |
1990-12-26 | 1990 | Dec | WZ | WZ |
1990-12-27 | 1990 | Dec | WZ | WZ |
1990-12-28 | 1990 | Dec | WZ | WZ |
1990-12-29 | 1990 | Dec | WZ | WZ |
1990-12-30 | 1990 | Dec | WZ | SWZ |
1990-12-31 | 1990 | Dec | SWZ | SWZ |
Tab.: Daily timeseries of European weather-types (GWL). The columns Month and GWL(t) are used to estimate the target in column GWL(t+1).
Decision Tree¶
Training¶
$$\mathbf{gwl[t]=\phi_{k,m}\cdot gwl[t-1]}$$
Testing¶
$$\mathbf{GWL_{1000}[\tau+1]=\phi_{k,m}\cdot GWL[\tau]}$$
Tree¶
Fig.: Illustration of the decision tree.
Evaluation¶
Fig.: Comparison of the annual frequency of weather-types (GWL): observed (top) and generated (bottom).
Code¶
Importing¶
#!/usr/bin/env python
# coding: utf-8
#get_ipython().run_line_magic('matplotlib', 'notebook')
## import dependencies
from sklearn import tree #For our Decision Tree
import pandas as pd # For our DataFrame
import pydotplus # To create our Decision Tree Graph
from IPython.display import Image # To Display a image of our graph
import numpy as N
import pylab as P
import random
import datetime
from scipy import signal,stats
P.style.use('bmh')
P.rcParams['font.family'] = 'serif'
Generating¶
def gwlgen(ja,je,nt):
print (ja,je,nj)
file = '../csv/gwlneudatum.dat'
jj=N.genfromtxt(file,usecols=(2),skip_header=1,dtype="I")
mm=N.genfromtxt(file,usecols=(1),skip_header=1,dtype="I")
dd=N.genfromtxt(file,usecols=(0),skip_header=1,dtype="I")
gw=N.genfromtxt(file,usecols=(3),skip_header=1,dtype='unicode')
id1 = N.where((jj>=1961)&(jj<=2019))[0]
jd1 = N.where((jj>=ja)&(jj<=je))[0]
jj1 = jj[id1]
mm1 = mm[id1]
dd1 = dd[id1]
gw1 = gw[id1]
go = N.array(list(set(gw)))
gg = N.arange(len(go))
print (go)
file = '../csv/03987.dat'
jj=N.genfromtxt(file,usecols=(2),skip_header=1,dtype="i")
mm=N.genfromtxt(file,usecols=(1),skip_header=1,dtype="i")
tg=N.genfromtxt(file,usecols=(4),skip_header=1,dtype="f")
rr=N.genfromtxt(file,usecols=(13),skip_header=1,dtype="f")
id2 = N.where((jj>=1961)&(jj<=2019))[0]
jd2 = N.where((jj>=ja)&(jj<=je))[0]
jj1 = jj[id2]
mm1 = mm[id2]
tg1 = tg[id2]
rr1 = rr[id2]
ng = 30
nm = 12
tmit = N.zeros((nm,ng),float)
nied = N.zeros((nm,ng),float)
nk = 1000
tmits = N.zeros((nm,ng,nk),float);tmits[:,:,:] = N.nan
nieds = N.zeros((nm,ng,nk),float);nieds[:,:,:] = N.nan
mo = N.arange(1,13,1)
for ig in range(ng):
for im in range(nm):
ind = N.where((mm1==mo[im])&(gw1==go[ig]))[0]
if(len(ind)==0):
tmit[im,ig] = tmit[im-1,ig]
nied[im,ig] = nied[im-1,ig]
else:
#id = random.choices(ind)[0]
#print (id)
#tmit[im,ig] = tg[id]
#nied[im,ig] = rr[id]
tmit[im,ig] = N.nanmean(tg1[ind])
nied[im,ig] = N.nanmean(rr1[ind])
nl = len(ind)
tmits[im,ig,:nl] = tg1[ind]
nieds[im,ig,:nl] = rr1[ind]
##############################################
#f = open('obs_%i-%i.txt'%(1961,2018),'w')
#f.write('%5s%5s%5s%5s%8s%8s\n'%('ja','mo','ta','gw','tg','rr'))
#for d in range(len(id1)):
# ig = N.where(go==gw1[d])[0]
# im = N.where(mo==mm1[d])[0]
# tgx = tmit[im[0],ig[0]]
# rrx = nied[im[0],ig[0]]
# f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jj1[d],mm1[d],dd1[d],gw1[d],tgx,rrx))
#f.close()
##############################
mon = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
P.figure(figsize=(13,4))
P.subplot(121)
PC = P.pcolor(tmit,cmap=P.get_cmap('bwr'),vmin=-5,vmax=20,edgecolors='k',linewidths=1)
P.xticks(gg+0.5,go,rotation=45)
P.yticks(mo-0.5,mon)
P.title('Potsdam: Averaged Temperature per Month and GWL [mm/d]',fontsize=10,weight='bold')
for m in mo:
for g in gg:
P.text(g+0.5,m-0.5,'%.1f'%tmit[m-1,g],fontsize=5,ha='center',va='center')
P.subplot(122)
P.pcolor(nied,cmap=P.get_cmap('YlGnBu'),vmin=1,vmax=8,edgecolors='k', linewidths=1)
P.xticks(gg+0.5,go,rotation=45)
P.yticks(mo-0.5,mon)
P.title('Potsdam: Averaged Precipitation per Month and GWL [mm/d]',fontsize=10,weight='bold')
for m in mo:
for g in gg:
P.text(g+0.5,m-0.5,'%.1f'%nied[m-1,g],fontsize=5,ha='center',va='center')
P.tight_layout()
P.savefig('./img/matrix.png',dpi=240,transparent=False,bbox_inches='tight',pad_inches=0.0)
################################
jj1 = jj[jd2]
mm1 = mm[jd2]
tg1 = tg[jd2]
rr1 = rr[jd2]
gw1 = gw[jd1]
gw2 = gw[jd1+1]
nd = len(jj1)
mx1 = []
for m in mm1:
if(m== 1): dum = "Jan"
if(m== 2): dum = "Feb"
if(m== 3): dum = "Mar"
if(m== 4): dum = "Apr"
if(m== 5): dum = "Mai"
if(m== 6): dum = "Jun"
if(m== 7): dum = "Jul"
if(m== 8): dum = "Aug"
if(m== 9): dum = "Sep"
if(m==10): dum = "Okt"
if(m==11): dum = "Nov"
if(m==12): dum = "Dez"
mx1.append(dum)
mx1 = N.array(mx1)
# In[3]:
df = pd.DataFrame()
df['G0'] = gw1
df['M0'] = mx1
df['G1'] = gw2
#print(df)
df.to_html('table0.html')
# In[4]:
one_hot_data = pd.get_dummies(df[['G0','M0']])
# one_hot_data = pd.get_dummies(df[['G0']])
one_hot_data.to_html('table1.html')
clf = tree.DecisionTreeClassifier(criterion="entropy")#"entropy")# Training the Decision Tree
clf_train = clf.fit(one_hot_data,df[['G1']])
# In[6]:
keys = []
for key in one_hot_data.columns:
keys.append(key.split('_')[1])
keys = N.array(keys)
nk = len(keys)
print (keys)
#dot_data = tree.export_graphviz(clf_train,out_file=None,feature_names=keys,class_names=N.array(keys[0:30],str),rounded=True,filled=True,rotate=True)
#graph = pydotplus.graph_from_dot_data(dot_data)
#graph.write_png('./img/tree.png')
#Image(graph.create_png())
###########################
inp = N.array(one_hot_data)
obs = []
mod = []
for d in range(nd):
#prediction = clf_train.predict([inp[d,:]])
probabilit = clf_train.predict_proba([inp[d,:]])
weights = probabilit
nw = weights.shape[1]
elements = one_hot_data.columns[0:nw]
pred = random.choices(elements, weights[0,:],k=1)
pred = pred[0].split('_')[1]
obs.append(gw1[d])
mod.append(pred)
#print('%5s%5s%5s'%(gw1[d],gw2[d],pred))
obs = N.array(obs)
mod = N.array(mod)
######################################
k=datetime.date(2000+nj,1,1)-datetime.date(2000,1,1)
nt = k.days
start = datetime.datetime(2000,1,1)
init = N.zeros(nk,int)
f = open('../csv/gen_%i-%i/gen_%i-%i.txt'%(ja,je,ja,je),'w')
f.write('%5s%5s%5s%5s%8s%8s\n'%('ja','mo','ta','gw','tg','rr'))
ind = N.where((keys=='WZ')|(keys=='Jan'))[0]
init[ind]=1
gen = []
for t in range(nt):
date = start + datetime.timedelta(days=t)
if(t>0):
m = date.month
if(m==1): mm = "Jan"
if(m==2): mm = "Feb"
if(m==3): mm = "Mar"
if(m==4): mm = "Apr"
if(m==5): mm = "Mai"
if(m==6): mm = "Jun"
if(m==7): mm = "Jul"
if(m==8): mm = "Aug"
if(m==9): mm = "Sep"
if(m==10): mm = "Okt"
if(m==11): mm = "Nov"
if(m==12): mm = "Dez"
init = N.zeros(nk,int)
ind = N.where((keys==kk)|(keys==mm))[0]
init[ind]=1
probabilit = clf_train.predict_proba([init])
weights = probabilit
#nw = weights.shape[1]
elements = keys[0:nw]
pred = random.choices(elements, weights[0,:],k=1)
kk = pred[0]
jx = date.year
mx = date.month
dx = date.day
gw = pred[0]
ig = N.where(go==gw)[0]
im = N.where(mo==mx)[0]
tg = tmit[im[0],ig[0]]
rr = nied[im[0],ig[0]]
tgs = tmits[im[0],ig[0],:]
rrs = nieds[im[0],ig[0],:]
tgs = tgs[~N.isnan(tgs)]
rrs = rrs[~N.isnan(rrs)]
nums = list(N.arange(len(tgs)))
if(len(nums)>0):
ran = random.choice(nums)
tg = tgs[ran]
rr = rrs[ran]
f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))
else:
f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))
#f.write('%5i%5i%5i%5s%8.1f%8.1f\n'%(jx,mx,dx,gw,tg,rr))
gen.append(gw)
f.close()
gen = N.array(gen)
# In[7]:
fig = P.figure(figsize=(10,6))
fig.suptitle('Weather-Type Sequence Generator using Decision Trees: Training Period (%i-%i) N=%i'%(ja,je,nj), y=1.03, fontsize=12,weight='bold')
num_obs = []
num_mod = []
num_gen = []
for key in keys[0:nk]:
ind = N.where(obs==key)[0]
num_obs.append(len(ind))
ind = N.where(mod==key)[0]
num_mod.append(len(ind))
ind = N.where(gen==key)[0]
num_gen.append(len(ind))
num_obs = N.array(num_obs)
num_mod = N.array(num_mod)
num_gen = N.array(num_gen)
ax = P.subplot(311)
P.bar(N.arange(nk),100.*num_obs/N.sum(num_obs),0.8,color='dodgerblue',ec='k',label='obs')
P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
P.xlim(-1,30)
P.ylim(0,20)
P.ylabel('Anteil [%]',weight='bold')
P.legend(loc=2)
ax.tick_params(direction='out')
ax = P.subplot(312)
P.bar(N.arange(nk),100.*num_mod/N.sum(num_mod),0.8,color='dodgerblue',ec='k',label='mod')
P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
P.xlim(-1,30)
P.ylim(0,20)
P.ylabel('Anteil [%]',weight='bold')
P.legend(loc=2)
ax.tick_params(direction='out')
ax = P.subplot(313)
P.bar(N.arange(nk),100.*num_gen/N.sum(num_gen),0.8,color='dodgerblue',ec='k',label='gen')
P.xticks(N.arange(nk-12),keys[0:nk-12],rotation=45)
P.xlim(-1,30)
P.ylim(0,20)
P.ylabel('Anteil [%]',weight='bold')
P.legend(loc=2)
ax.tick_params(direction='out')
P.tight_layout()
P.savefig('./img/gen_%i-%i.png'%(ja,je),dpi=240,transparent=False,bbox_inches='tight',pad_inches=0.0)
return
Running¶
ja=1990;je=2019;nj=1000
gwlgen(ja,je,nj)
#ja=1990;je=2019;nj=1000
#gwlgen(ja,je,nj)