Commit d8be18d0 authored by Eric Dagobert

Initial commit

# config_manager.py
import configparser

import numpy as np


class ConfigReader:
    """Thin wrapper around regimes.ini exposing one accessor per setting."""

    def __init__(self, path):
        self._path = path
        self._config = configparser.ConfigParser()
        self._config.read(path)

    def scavroot(self):
        return self._config['scav']['root']

    def dates(self):
        x = np.datetime64(self._config['scav']['datefrom']).astype(object)
        y = np.datetime64(self._config['scav']['dateto']).astype(object)
        return x, y

    def scavsuffix(self):
        return self._config['scav']['filetype']

    def scavformat(self):
        return self._config['scav']['fileformat']

    def scavout(self):
        return self._config['scav']['fileout']

    def scavlistfield(self):
        return self._config['scav']['listfields']

    def distrmastersource(self):
        return self._config['distributions']['mastersource']

    def distrfields(self):
        return self._config['distributions']['fields'].split(',')

    def distrws(self):
        return int(self._config['distributions']['windowsize'])

    def distroverlap(self):
        return int(self._config['distributions']['overlap'])

    def mastercovlist(self):
        return self._config['distributions']['mastercovlist']

    def clusterN(self):
        return int(self._config['clusters']['N'])

    def claffinity(self):
        # affinity is a string such as 'nearest_neighbors', not an int
        return self._config['clusters']['affinity']

    def cleigen_solver(self):
        # eigen_solver is a string such as 'arpack', not an int
        return self._config['clusters']['eigen_solver']

    def clspectral_neighbors(self):
        # number of neighbors used by the spectral embedding in the clustering script
        return int(self._config['clusters']['spectraln_neighbors'])

    def cln_neighbors(self):
        return int(self._config['clusters']['n_neighbors'])

    def clrandom_states(self):
        return int(self._config['clusters']['random_states'])

    def smooth_hellinger(self):
        # bool('False') would be True, so parse the flag with getboolean
        return self._config['clusters'].getboolean('smooth hellinger')


if __name__ == "__main__":
    c = ConfigReader('regimes.ini')
    print(c.scavlistfield())
#!/usr/bin/python3 -O
# distribs.py: fits a Gaussian (mean + sparse covariance) on each rolling
# window of log-returns and pickles the resulting list
from utils import ProgressBar
from config_manager import ConfigReader
import pandas
import numpy as np
from sklearn.covariance import GraphicalLassoCV
import dill
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


class DistList:
    """Container for the rolling-window Gaussian estimates."""

    def __init__(self, covindex, chunks, covlist, ilist):
        self._covindex = covindex
        self._chunks = chunks
        self._covlist = covlist
        self._ilist = ilist
        self._index = None


class Distributions:
    def __init__(self, varfact=1.1, eps=.2, window=90, overlap=60):
        self._varfact = varfact
        self.eps = eps
        self._window = window
        self._overlap = overlap

    def compute_gaussians(self, window, overlap):
        """Fit a mean and GraphicalLassoCV covariance on each overlapping
        window of log-returns and stack the [mu | cov] blocks."""
        i = 0
        ret = None
        rw = []
        avdi = np.full((self._N,), 0.)
        ilist = []
        pr = ProgressBar(self._logret.shape[0] / (window - overlap),
                         prefix='Covariances Computed', suffix='Complete')
        while i < self._logret.shape[0]:
            iend = min(i + window, self._logret.shape[0])
            chunk = self._logret[i:iend, :]
            ilist.append((i, iend))
            rw.append(chunk)
            pr.iterate()
            mu = chunk.mean(axis=0)
            model = GraphicalLassoCV(cv=3)
            model.fit(chunk.astype('float64'))
            cov = model.covariance_
            di = np.diag(cov)
            avdi += di
            # inflate the diagonal to keep the matrix safely positive definite
            np.fill_diagonal(cov, di * self._varfact)
            # check spd
            # ch = np.linalg.cholesky(cov)
            r_ = np.hstack((mu.reshape(-1, 1), cov))
            if ret is None:
                ret = r_.reshape(1, r_.shape[0], r_.shape[1])
            else:
                ret = np.vstack((ret, r_.reshape(1, r_.shape[0], r_.shape[1])))
            if iend < self._logret.shape[0]:
                i = iend - overlap
            else:
                break
        dlist = DistList(np.arange(ret.shape[0]), rw, ret, ilist)
        return dlist

    def load(self, mfile, mfields):
        pd = pandas.read_csv(mfile, sep=',')
        self._index = pd['Date'].values
        self._logret = pd[mfields].values
        self._N = self._logret.shape[1]


def main():
    creader = ConfigReader('regimes.ini')
    mfile = creader.distrmastersource()
    mfields = creader.distrfields()
    D = Distributions(varfact=1.1, eps=.2, window=90, overlap=60)
    D.load(mfile, mfields)
    window = creader.distrws()
    overlap = creader.distroverlap()
    dlist = D.compute_gaussians(window, overlap)
    dlist._index = D._index
    mcovf = creader.mastercovlist()
    print()
    print('list of covariances saved in..')
    with open(mcovf, 'wb') as f:
        dill.dump(dlist, f)
    print(mcovf)


if __name__ == "__main__":
    main()
#!/usr/bin/python3 -O
# clustering script: builds a Hellinger affinity matrix over the rolling-window
# Gaussians produced by distribs.py, embeds it and runs spectral clustering
from distribs import DistList  # needed so dill can unpickle DistList objects
from config_manager import ConfigReader
import dill
import os.path
from utils import ProgressBar
import numpy as np
from math import exp, sqrt, atanh
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from sklearn import cluster, manifold


class Clusters:
    def __init__(self, conf):
        self._conf = conf
        self._NCLUSTERS = conf.clusterN()
        self._THRESHOLD = 1
        distfile = conf.mastercovlist()
        with open(distfile, 'rb') as f:
            self._distlist = dill.load(f)

    def spectral_centroids(self):
        self._centroids = []
        for i in range(self._clabels.max() + 1):
            idx = self._clabels == i
            mu = self._embedding[:, idx].mean(axis=1).reshape(2, 1)
            # find the embedded point closest to mu
            dist = np.square(self._embedding - mu).sum(axis=0)
            i2 = np.argmin(dist)
            covs = self._distlist._covlist
            self._centroids.append((self._embedding[:, i2], i2,
                                    self._embedding[:, idx].shape[1], covs[i2]))

    def graph_clusters(self):
        t = 'clusters'
        embedding = self._embedding
        centroids = self._centroids
        L = self._clabels
        colormap = [cm.gnuplot2(i) for i in np.linspace(0., 1., max(L) + 2)]
        colorms = [colormap[i] for i in L]
        fig = plt.figure()
        plt.scatter(embedding[0, :], embedding[1, :], s=10, c=colorms, alpha=0.8)
        for i, col in enumerate(range(max(L) + 1)):
            lxy = centroids[i][0]
            labelx, labely = lxy[0], lxy[1]
            textx = labelx + 0.004
            texty = labely + 0.01
            plt.annotate(str(col), xy=(labelx, labely), xytext=(textx, texty),
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
                         size=6)
        for i, z in enumerate(centroids):
            plt.scatter(z[0][0], z[0][1], s=50, c=[colormap[i]], alpha=1.0)
        plt.title(t)
        plt.savefig('clusters')
        plt.close(fig)
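
    # hellinger() below computes the Hellinger distance between two Gaussians
    # N(mu1, sigma1) and N(mu2, sigma2) packed as [mu | cov] blocks:
    #   H^2 = 1 - det(sigma1)^(1/4) * det(sigma2)^(1/4) / det((sigma1+sigma2)/2)^(1/2)
    #             * exp(-(1/8) * (mu1-mu2)^T * ((sigma1+sigma2)/2)^(-1) * (mu1-mu2))
    # Log-determinants (slogdet) are used for numerical stability; when the
    # 'smooth hellinger' option is set, atanh(H^2) is returned instead of H.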
    def hellinger(self, n, x, y, smooth, **kwargs):
        # unpack the stacked [mu | cov] blocks produced by distribs.py
        x1 = x.reshape(n, n + 1)
        x2 = y.reshape(n, n + 1)
        mu1, sigma1 = x1[:, 0], x1[:, 1:]
        mu2, sigma2 = x2[:, 0], x2[:, 1:]
        sign1, ld1 = np.linalg.slogdet(sigma1)
        sign2, ld2 = np.linalg.slogdet(sigma2)
        dd = 0.5 * (sigma1 + sigma2)
        signdd, ldd = np.linalg.slogdet(dd)
        signt = (sign1 * sign2) / signdd
        if signt <= 0:
            print('warning: non-positive determinant in Hellinger computation')
        s_1 = np.linalg.inv(dd)
        e = -0.125 * (mu1 - mu2).T.dot(s_1).dot(mu1 - mu2)
        d = .25 * (ld1 + ld2 - 2 * ldd)
        d = exp(d)
        es = 1 - d * exp(e)
        if smooth:
            r = atanh(es)
        else:
            r = sqrt(es)
        return r

    def compute_clusters(self):
        nn = self._conf.clspectral_neighbors()
        node_position_model = manifold.SpectralEmbedding(n_components=2,
                                                         n_neighbors=nn,
                                                         random_state=42)
        self._embedding = node_position_model.fit_transform(self._hellingerd.T).T
        X = self._embedding.astype('float64').T
        N = self._conf.clusterN()
        aff = self._conf.claffinity()
        nn = self._conf.cln_neighbors()
        rs = self._conf.clrandom_states()
        es = self._conf.cleigen_solver()
        self._agg = cluster.SpectralClustering(n_clusters=N, affinity=aff,
                                               eigen_solver=es, n_neighbors=nn,
                                               random_state=rs).fit(X)
        self._clabels = self._agg.labels_
        with open('clabels.plk', 'wb') as f:
            dill.dump(self._clabels, f)
        with open('clusterengine.plk', 'wb') as f:
            dill.dump(self._agg, f)
        self.spectral_centroids()

    def cal_affinity_matrix(self):
        # reuse a cached affinity matrix if one is already on disk
        if os.path.isfile('hellinger.plk'):
            self._hellingerd = dill.load(open('hellinger.plk', 'rb'))
            return
        cind = self._distlist._covindex
        covs = self._distlist._covlist
        hellingerd = np.full((covs.shape[0], covs.shape[0]), 0.)
        n = covs.shape[1]
        br = ProgressBar(covs.shape[0] * covs.shape[0],
                         prefix='Hellinger Affinities', suffix='Completed')
        for u in cind:
            for v in cind:
                br.iterate()
                if hellingerd[v, u] == 0:
                    hellingerd[u, v] = self.hellinger(n, covs[u], covs[v],
                                                      self._conf.smooth_hellinger())
                else:
                    # the distance is symmetric, so mirror the stored value
                    hellingerd[u, v] = hellingerd[v, u]
        self._hellingerd = hellingerd
        with open('hellinger.plk', 'wb') as f:
            dill.dump(hellingerd, f)


if __name__ == "__main__":
    conf = ConfigReader('regimes.ini')
    cl = Clusters(conf)
    cl.cal_affinity_matrix()
    cl.compute_clusters()
    cl.graph_clusters()
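As a rough illustration of how the pickled outputs above could be tied back to calendar dates, here is a minimal sketch, assuming distribs.py and the clustering script have already been run in the working directory; it relies only on the _index and _ilist attributes that distribs.py stores on DistList and on the clabels.plk file written by compute_clusters().

# illustrative snippet (hypothetical helper, not part of the commit)
import dill
from config_manager import ConfigReader
from distribs import DistList  # required so dill can unpickle DistList

conf = ConfigReader('regimes.ini')

# rolling-window Gaussians saved by distribs.py
with open(conf.mastercovlist(), 'rb') as f:
    dlist = dill.load(f)

# one cluster label per window, saved by compute_clusters()
with open('clabels.plk', 'rb') as f:
    labels = dill.load(f)

# map each window back to its date range using the stored row indices
for (start, end), label in zip(dlist._ilist, labels):
    print(dlist._index[start], '->', dlist._index[end - 1], 'regime', label)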
[scav]
root=/home/eric/src/Data/axioma
datefrom=1982-01-01
dateto=1999-02-01
filetype=ret
fileformat=AXUS4-MH.
fileout=scavfactors.csv
listfields =United States,Value,Volatility,Aerospace & Defense
[distributions]
mastersource=RFreturns.csv
fields=ComEnOil,ComMeGol,CurEUR,CurJPY,EqCHN,EqEME,EqGrVaUSA,EqJPN,EqLaSmUSA,EqSeFinUSA,EqSeITeUSA,EqUSA,EqWLD,FICrHAAAUSA,FICrHBAAUSA,FICreHyUSA,FIG10YUSA,FIG3MUSA
windowsize=60
overlap=40
mastercovlist=mastercovlist.pkl
[clusters]
N=8
affinity=nearest_neighbors
eigen_solver=arpack
spectraln_neighbors=6
n_neighbors=10
random_states=48
smooth hellinger=True
#!/usr/bin/python3 -O
# scavenger script: walks the monthly data tree configured in [scav] and
# flattens the selected factor fields into a single dated CSV
import os
import numpy as np
from config_manager import ConfigReader

_creader = ConfigReader('regimes.ini')
_currdir = '.'


def browse():
    """Yield the factor files between the configured dates, one month at a time."""
    global _creader, _currdir
    _currdir = os.getcwd()
    os.chdir(_creader.scavroot())
    suff = '.' + _creader.scavsuffix()
    pref = _creader.scavformat()
    f, t = _creader.dates()
    for dt in np.arange(f, t, dtype='datetime64[M]'):
        pdir = np.datetime_as_string(dt, unit='M').replace('-', '/')
        if os.path.exists(pdir):
            for file in os.listdir(pdir):
                if file.endswith(suff) and file.startswith(pref):
                    yield os.path.join(pdir, file)


def create_csv(header):
    ofile = os.path.join(_currdir, _creader.scavout())
    fcsv = open(ofile, 'w')
    print(', '.join(header), file=fcsv, flush=True)
    return fcsv


def build_output():
    first = True
    rows = None
    print('processing files ..')
    lfield = _creader.scavlistfield().split(',')
    for i, f in enumerate(browse()):
        with open(f, 'r') as fd:
            row = []
            header = ['Date']
            print('\r%2d' % i, flush=True, end='')
            for lines in fd.readlines():
                if lines.startswith('#'):
                    # the '#DataDate' header carries the as-of date of the file
                    if lines.startswith('#DataDate'):
                        asof = np.datetime64(lines.split(':')[1].strip())
                        row = [asof]
                    continue
                a = lines.split('|')
                if lfield and a[0] in lfield:
                    row.append(float(a[1]))
                    header.append(a[0])
            if first:
                fcsv = create_csv(header)
                first = False
            if rows is None:
                rows = np.array(row)
            else:
                rows = np.vstack((rows, row))
    rows = rows.T
    dsort = np.argsort(rows[0, :])
    print('done')
    print('flushing')
    # write the rows out in date order
    for ii, i in enumerate(dsort):
        print('\r%2d' % i, flush=True, end='')
        print(np.datetime_as_string(rows[0, i], unit='D'), ',',
              ', '.join(map(str, rows[1:, i])), file=fcsv)
    print('done')
    fcsv.close()


if __name__ == "__main__":
    build_output()
    # t,d,T = get_factors([1,2,3],datefrom='2007-01-31')
    # print (T)
# utils.py
class ProgressBar:
    """Minimal console progress bar."""

    def __init__(self, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
        self._total = total
        self._iteration = 0
        self._prefix = prefix
        self._suffix = suffix
        self._decimals = decimals
        self._length = length
        self._fill = fill
        self._percent = 0

    def _display(self):
        self._percent = ("{0:." + str(self._decimals) + "f}").format(
            100 * (self._iteration / float(self._total)))
        filledLength = int(self._length * self._iteration // self._total)
        bar = self._fill * filledLength + '-' * (self._length - filledLength)
        print('\r%s |%s| %s%% %s' % (self._prefix, bar, self._percent, self._suffix), end='\r')
        # print a new line on completion
        if self._iteration == self._total:
            print()

    def iterate(self, inc=1):
        self._iteration += inc
        self._display()


if __name__ == "__main__":
    pr = ProgressBar(50, prefix='Progress', suffix='Complete', length=50)
    for i in range(0, 50):
        pr.iterate()