Entscheidungsmodell

Trainiertes Entscheidungmodell zwischen Störmeldungen und Großwetterlagen

Methode¶

python library “scikit-learn”
non-parametric supervised learning method
The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features

Leseprobe¶

Die Kritikalität von Wettermustern auf Schadensmuster werden durch einen Machine-Learning Algorithmus “decision tree” realsiert. Dieser lernt aus Trainingsdaten und leitet daraus Regeln für die Anwendung auf Testdaten ab.

Die Zielgröße sind tägliche Störmeldungen in den 34 PDs der Bahn von 2015-2019. Die Unabhängigen Variablen sind die Monate und die Großwetterlagen. Diese gelten für alle PDs gleichermaßen. Weitere meteorologische Größen sind optional.

Die sortierte Jahressumme der Meldungen je PD dient der Orientiertung. Der Anteil der Meldungen, welche durch Großwetterlagen erklärt werden können ist kleiner, aber folgt dem tatsächlichen Verlauf.

Mittels Testdaten, d.h. Daten zu Großwetterlagen aus der Vergangenheit oder aus Klimamodellen, lassen sich Aussagen über Entwicklungen ableiten.

Ergebnis¶

Code¶

Python

Importing

import sys
import os
import matplotlib
matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.basemap import shiftgrid
from matplotlib.path import Path
from matplotlib.patches import PathPatch
import numpy
import shapefile
import numpy as N
import scipy.ndimage
from scipy.ndimage.filters import gaussian_filter
import matplotlib.mlab as ml
from scipy.interpolate import griddata
from scipy import interpolate
import math
from scipy import signal,stats
from scipy.ndimage.filters import uniform_filter
import string
from matplotlib.collections import PatchCollection
from pylab import Polygon
import matplotlib as mpl
import pandas as pd
from sklearn import tree
from sklearn import metrics
from sklearn.externals.six import StringIO 
from sklearn.metrics import accuracy_score

import pydotplus # To create our Decision Tree Graph
from IPython.display import Image  # To Display a image of our graph

Setting

plt.style.use('bmh')
params = {'legend.fontsize': 8,'font.family': 'serif'}
plt.rcParams.update(params)

Defining

def deg2num(lat_deg, lon_deg, zoom):
  lat_rad = math.radians(lat_deg)
  n = 2.0 ** zoom
  xtile = int((lon_deg + 180.0) / 360.0 * n)
  ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad))) / math.pi) / 2.0 * n)
  return (xtile, ytile)

def moving_summe(a,n=3):
    ret=N.cumsum(a,dtype=float)
    ret[n:]=ret[n:]-ret[:-n]
    return ret[n-1:] 

def moving_average(a,n=3):
    ret=N.cumsum(a,dtype=float)
    ret[n:]=ret[n:]-ret[:-n]
    return ret[n-1:]/n

Reading

file = '../dat/gwlneudatum.dat' 

jj=N.genfromtxt(file,usecols=(2),skip_header=0,dtype="I")
mm=N.genfromtxt(file,usecols=(1),skip_header=0,dtype="I")
dd=N.genfromtxt(file,usecols=(0),skip_header=0,dtype="I")
gw=N.genfromtxt(file,usecols=(3),skip_header=0,dtype=str)

ind = N.where((jj>=2015)&(jj<=2019))[0]

gwls = gw[ind]
nd = len(ind)

Processing

gcms = ['MPI-ESM-LR','MIROC5','HadGEM2-ES','CNRM-CM5','IPSL-CM5A-MR','NorESM1-M','EC-EARTH','CanESM2']
gcms = ['MPI-ESM-LR','MIROC5','CNRM-CM5','IPSL-CM5A-MR','NorESM1-M','EC-EARTH']
gcms = ['MPI-ESM-LR','MIROC5','CNRM-CM5','IPSL-CM5A-MR','NorESM1-M']

ng = len(gcms)

peri = [1961,1981,2000,2021,2041,2061]
np = len(peri)

krit = N.zeros((ng,34,np,2),float);krit[:,:,:,:] = N.nan

filter = 'Filter1'

m = Basemap(projection='merc',llcrnrlat=47,urcrnrlat=55.2,llcrnrlon=5,urcrnrlon=16,resolution='l')
m.readshapefile('../shp/OSS_M1_PROD_DURCH_FL_EXT_geo', 'pds',zorder=10)

obs1 = []
sim1 = []

obs2 = []
sim2 = []

nam = []

xs = []
ys = []
zs = []

k = -1

for info, shape in zip(m.pds_info, m.pds):

    k = k+1

    p = info['NAME']

    nam.append(p.decode('utf-8'))

    file = '../csv/pds-uas/'+p+'.csv'
    data=N.genfromtxt(file,names=True,comments='#',delimiter=';',dtype=None,encoding='utf-8')
    id = N.where((data['ja']>=2015)&(data['ja']<=2019))[0]

    bahn1 = data['Filter1'][id]
    bahn2 = data['Filter2'][id]

    file = '../csv/obs-dwd/'+p+'.csv'
    data=N.genfromtxt(file,names=True,comments='#',delimiter=';',dtype=None,encoding='utf-8')
    id = N.where((data['ja']>=2015)&(data['ja']<=2019))[0]

    mona = data['mo'][id]
    tmax = data['tmax'][id]
    nied = data['nied'][id]
    wmax = data['wmax'][id]

    for i in mona:

        xs.append(i)

    mo = N.zeros(nd,str)

    mo[mona==1]  = 'Jan'        
    mo[mona==2]  = 'Feb'
    mo[mona==3]  = 'Mar'
    mo[mona==4]  = 'Apr'
    mo[mona==5]  = 'May'
    mo[mona==6]  = 'Jun'
    mo[mona==7]  = 'Jul'
    mo[mona==8]  = 'Aug'
    mo[mona==9]  = 'Sep'
    mo[mona==10] = 'Okt'
    mo[mona==11] = 'Nov'
    mo[mona==12] = 'Dec'

    tx = N.zeros(nd,str)

    tx[tmax>=30]  = 'tmax30'        
    tx[tmax<30]  = 'normal'

    pr = N.zeros(nd,str)

    pr[nied>=20]  = 'nied20'        
    pr[nied<20]  = 'normal'

    wx = N.zeros(nd,str)

    wx[wmax>=25]  = 'wmax25'        
    wx[wmax<25]  = 'normal'

    gw = N.array(gwls,str)

    ua1 = bahn1
    ua2 = bahn2  

    df = pd.DataFrame()

    df['mona'] = mo
    df['wmax'] = wx
    df['tmax'] = tx
    df['nied'] = pr
    df['gwls'] = gw
    df['bahn1'] = ua1
    df['bahn2'] = ua2

    obs1.append(N.sum(ua1))
    obs2.append(N.sum(ua2))

    for i in ua1:

        ys.append(i)

    one_hot_data = pd.get_dummies(df[ ['gwls','mona'] ])    

    clf = tree.DecisionTreeClassifier(criterion="gini")

    clf_train = clf.fit(one_hot_data,df['bahn1'])

    pred = clf_train.predict(one_hot_data)
    sim1.append(N.sum(pred))

    for i in pred:

        zs.append(i)

    clf_train = clf.fit(one_hot_data,df['bahn2'])
    pred = clf_train.predict(one_hot_data)
    sim2.append(N.sum(pred))

    ig = -1

    for gcm in gcms:

        ig = ig+1

        if(gcm!='HESS-BREZ'):

            file = '../dat/'+gcm+'.dat'

            tt=N.genfromtxt(file,usecols=(0),skip_header=0,delimiter=';',dtype=str)
            gg=N.genfromtxt(file,usecols=(3),skip_header=0,delimiter=';',dtype=str)

            nt = len(tt)

            mm = []
            jj = []

            for t in range(nt):

               jj.append(int(tt[t][0:4]))
               mm.append(int(tt[t][5:7]))  

            mm = N.array(mm)
            jj = N.array(jj)

        else:

             file = '../dat/gwlneudatum.dat'

             mm=N.genfromtxt(file,usecols=(1),skip_header=0,dtype='i')
             jj=N.genfromtxt(file,usecols=(2),skip_header=0,dtype='i')
             gg=N.genfromtxt(file,usecols=(3),skip_header=0,dtype=str)

        if((gcm=='HESS-BREZ')|(gcm=='NCEP-NCAR')): np = 3
        else: np = len(peri)

        for ip in range(np):

            ja = peri[ip]
            je = ja+19

            id = N.where((jj>=ja)&(jj<=je))[0]

            gx = gg[id]
            mx = mm[id]
            jx = jj[id]

            nt = len(id)

            mo = N.zeros(nt,str)

            mo[mx==1]  = 'Jan'        
            mo[mx==2]  = 'Feb'
            mo[mx==3]  = 'Mar'
            mo[mx==4]  = 'Apr'
            mo[mx==5]  = 'May'
            mo[mx==6]  = 'Jun'
            mo[mx==7]  = 'Jul'
            mo[mx==8]  = 'Aug'
            mo[mx==9]  = 'Sep'
            mo[mx==10] = 'Okt'
            mo[mx==11] = 'Nov'
            mo[mx==12] = 'Dec'

            df0 = pd.DataFrame()

            df0['mona'] = mo
            df0['gwls'] = gx

            clf_train = clf.fit(one_hot_data,df['bahn1'])

            one_hot_data1 = pd.get_dummies(df0[ ['gwls','mona'] ])

            pred = clf_train.predict(one_hot_data1)

            krit[ig,k,ip,0] = N.sum(pred)

            clf_train = clf.fit(one_hot_data,df['bahn2'])

            pred = clf_train.predict(one_hot_data1)

            krit[ig,k,ip,1] = N.sum(pred)

obs1 = N.array(obs1)
sim1 = N.array(sim1)

obs2 = N.array(obs2)
sim2 = N.array(sim2)

nam = N.array(nam)

xs = N.array(xs)
ys = N.array(ys)
zs = N.array(zs)

Plotting

fig = plt.figure(figsize=(12,10))#,frameon=False)

plt.subplot(221)
pds = N.arange(34)

id = N.argsort(obs1)

plt.plot(obs1[id]/5.,pds,'k',label='obs')
plt.plot(sim1[id]/5.,pds,'b',label='sim')

i = -1

for gcm in gcms:

    i = i+1

    plt.plot(krit[i,id,0,0]/20.,pds,'g',label=gcm,lw=0.5)
    plt.plot(krit[i,id,1,0]/20.,pds,'orange',lw=0.5)
    plt.plot(krit[i,id,2,0]/20.,pds,'red',lw=0.5)

plt.title('Entscheidungsmodell: Meldungen ~ P(Monat,GWL)',fontsize=12,weight='bold')
plt.xlabel('Meldungen (Filter1)',fontsize=14,weight='bold')
plt.yticks(pds,nam[id],fontsize=10)
plt.ylim(-1,34)
plt.legend(loc=4,shadow='true')

plt.subplot(222)

id = N.argsort(obs2)

plt.plot(obs2[id]/5.,pds,'k',label='obs')
plt.plot(sim2[id]/5.,pds,'b',label='sim')

i = -1

for gcm in gcms:

    i = i+1

    plt.plot(krit[i,id,0,1]/20.,pds,'g',label=gcm,lw=0.5)
    plt.plot(krit[i,id,1,1]/20.,pds,'orange',lw=0.5)
    plt.plot(krit[i,id,2,1]/20.,pds,'red',lw=0.5)

plt.title('Entscheidungsmodell: Meldungen ~ P(Monat,GWL)',fontsize=12,weight='bold')
plt.xlabel('Meldungen (Filter2)',fontsize=14,weight='bold')
plt.yticks(pds,nam[id],fontsize=10)
plt.ylim(-1,34)
plt.legend(loc=4,shadow='true')

plt.subplot(223)

i = -1

for gcm in gcms:

    i = i+1

    tmp = N.sum(krit[i,:,:,0],0)/30.-N.sum(krit[i,:,0,0],0)/30.

    if((gcm=='HESS-BREZ')|(gcm=='NCEP-NCAR')): x = 2
    else: x=len(peri)-1

    tmp = N.cumsum(tmp-tmp[2])

    plt.plot(peri,tmp,'r-o',label=gcm,lw=0.5)
    plt.text(peri[x]+5,tmp[x],gcm,va='center')

plt.ylabel('Meldungen (Filter1)',fontsize=14,weight='bold')
plt.xticks(peri)
plt.xlim(1951,2100)
plt.ylim(-3000,3000)

plt.subplot(224)

i = -1

for gcm in gcms:

    i = i+1

    tmp = N.sum(krit[i,:,:,1],0)/30.-N.sum(krit[i,:,0,1],0)/30.

    if((gcm=='HESS-BREZ')|(gcm=='NCEP-NCAR')): x=2
    else: x = len(peri)-1

    tmp = N.cumsum(tmp-tmp[2])

    plt.plot(peri,tmp,'r-o',label=gcm,lw=0.5)
    plt.text(peri[x]+5,tmp[x],gcm,va='center')

plt.ylabel('Meldungen (Filter2)',fontsize=14,weight='bold')
plt.xticks(peri)
plt.xlim(1951,2100)
plt.ylim(-200,200)

plt.tight_layout()
plt.savefig('./img/decision.png',dpi=240,bbox_inches='tight',pad_inches=0.0)#,transparent='true')

Plotting

fig = plt.figure(figsize=(12,5))#,frameon=False)

for k in [1,2]:

    plt.subplot(1,2,k)

    for sw in [0,20]:#,40]:

        if(k==1): yy = ys
        if(k==2): yy = zs

        id = N.where(yy>sw)[0]

        xx = xs[id]
        yy = yy[id]

        values = N.vstack([xx,yy])

        print (values.shape)

        kernel = stats.gaussian_kde(values)
        X, Y = N.mgrid[0:13:100j,0:100:200j]
        positions = N.vstack([X.ravel(),Y.ravel()])
        Z2 = N.reshape(kernel(positions).T, X.shape)

        lev = N.arange(0.5,10,0.5)

        CF=plt.contourf(X,Y,1000*Z2,levels=lev,cmap=plt.cm.YlOrRd,extend='max',zorder=1)
        plt.contour(X,Y,1000*Z2,levels=lev,colors='k',linewidths=0.5,zorder=1)

    plt.text(2,10,'N>0',size=14,weight='bold',ha='center',va='center',bbox=dict(boxstyle="circle", ec='k', fc='gray',alpha=0.5,lw=2))
    plt.text(2,30,'N>20',size=14,weight='bold',ha='center',va='center',bbox=dict(boxstyle="circle", ec='k', fc='gray',alpha=0.5,lw=2))
    #plt.text(2,50,'N>40',size=14,weight='bold',ha='center',va='center',bbox=dict(boxstyle="circle", ec='k', fc='gray',alpha=0.5,lw=2))

    plt.tick_params(direction='out')

    plt.xlim(0.5,12.5)
    plt.xticks([1,2,3,4,5,6,7,8,9,10,11,12],['Jan','Feb','Mar','Apr','Mai','Jun','Jul','Aug','Sep','Okt','Nov','Dez'])
    plt.ylim(0,50)

    plt.grid(color='k')
    plt.ylabel('Anzahl',fontsize=16,weight='bold')

    if(k==1): plt.title('Bahn ('+filter+') ~ Monat: Beobachtung',fontsize=14,weight='bold')
    if(k==2): plt.title('Bahn ('+filter+') ~ Monat: Simulation',fontsize=14,weight='bold')

plt.savefig('./img/season.png',dpi=240,bbox_inches='tight',pad_inches=0.0)