Commit d8be18d0 authored by Eric Dagobert

Initial commit

parents
This diff is collapsed.
import configparser
import numpy as np
class ConfigReader:
    """Typed accessors for the settings stored in an INI file.

    The file is parsed once in the constructor.  Every accessor reads one
    option from the ``scav``, ``distributions`` or ``clusters`` section and
    raises ``KeyError`` when the section or option is missing.
    """

    def __init__(self, path):
        """Parse the INI file at *path*.

        ``ConfigParser.read`` silently ignores a missing file, so a wrong
        path only shows up as a ``KeyError`` on the first accessor call.
        """
        self._path = path
        self._config = configparser.ConfigParser()
        self._config.read(path)

    # ----------------------------------------------------------- [scav]
    def scavroot(self):
        """Return the ``root`` option of ``[scav]``."""
        return self._config['scav']['root']

    def dates(self):
        """Return ``(datefrom, dateto)`` of ``[scav]`` as Python date objects.

        The strings are parsed with ``numpy.datetime64`` and converted with
        ``astype(object)``.
        """
        x = np.datetime64(self._config['scav']['datefrom']).astype(object)
        y = np.datetime64(self._config['scav']['dateto']).astype(object)
        return x, y

    def scavsuffix(self):
        """Return the ``filetype`` option of ``[scav]``."""
        return self._config['scav']['filetype']

    def scavformat(self):
        """Return the ``fileformat`` option of ``[scav]``."""
        return self._config['scav']['fileformat']

    def scavout(self):
        """Return the ``fileout`` option of ``[scav]``."""
        return self._config['scav']['fileout']

    def scavlistfield(self):
        """Return the raw (unsplit) ``listfields`` option of ``[scav]``."""
        return self._config['scav']['listfields']

    # -------------------------------------------------- [distributions]
    def distrmastersource(self):
        """Return the ``mastersource`` option of ``[distributions]``."""
        return self._config['distributions']['mastersource']

    def distrfields(self):
        """Return the ``fields`` option of ``[distributions]`` split on commas."""
        return self._config['distributions']['fields'].split(',')

    def distrws(self):
        """Return the ``windowsize`` option of ``[distributions]`` as an int."""
        return int(self._config['distributions']['windowsize'])

    def distroverlap(self):
        """Return the ``overlap`` option of ``[distributions]`` as an int."""
        return int(self._config['distributions']['overlap'])

    def mastercovlist(self):
        """Return the ``mastercovlist`` option of ``[distributions]``."""
        return self._config['distributions']['mastercovlist']

    # ------------------------------------------------------- [clusters]
    def clusterN(self):
        """Return the ``N`` option of ``[clusters]`` as an int."""
        return int(self._config['clusters']['N'])

    def claffinity(self):
        """Return the ``affinity`` option of ``[clusters]`` as a string.

        The value is a name such as ``nearest_neighbors``; converting it to
        ``int`` (as was done previously) always raised ``ValueError``.
        """
        return self._config['clusters']['affinity'].strip()

    def cleigen_solver(self):
        """Return the ``eigen_solver`` option of ``[clusters]``.

        The value is a solver name such as ``arpack``.  An empty value or the
        literal ``none`` (any case) is returned as ``None``.
        """
        solver = self._config['clusters']['eigen_solver'].strip()
        if solver.lower() in ('', 'none'):
            return None
        return solver

    def clspectral_neighbors(self):
        """Return the ``spectraln_neighbors`` option of ``[clusters]`` as an int."""
        return int(self._config['clusters']['spectraln_neighbors'])

    def cln_neighbors(self):
        """Return the ``n_neighbors`` option of ``[clusters]`` as an int."""
        return int(self._config['clusters']['n_neighbors'])

    def clrandom_states(self):
        """Return the ``random_states`` option of ``[clusters]`` as an int."""
        return int(self._config['clusters']['random_states'])

    def smooth_hellinger(self):
        """Return the ``smooth hellinger`` option of ``[clusters]`` as a bool.

        ``bool()`` on a non-empty string is always ``True`` (even for
        ``"False"``), so the text is looked up in the parser's
        ``BOOLEAN_STATES`` table instead.

        Raises:
            ValueError: if the text is not a recognised boolean spelling.
        """
        raw = self._config['clusters']['smooth hellinger']
        key = raw.strip().lower()
        states = self._config.BOOLEAN_STATES
        if key not in states:
            raise ValueError('Not a boolean: %s' % raw)
        return states[key]
if __name__ == "__main__":
    # Manual smoke test: show the configured [scav] field list.
    reader = ConfigReader('regimes.ini')
    print(reader.scavlistfield())
\ No newline at end of file
#!/usr/bin/python3 -O
from utils import ProgressBar
from config_manager import ConfigReader
import pandas
import numpy as np
from sklearn.covariance import GraphicalLassoCV, LedoitWolf
import dill
import sys
# Install a blanket "ignore" warnings filter, but only when sys.warnoptions
# is empty so that explicitly requested warning options keep their effect.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
class DistList:
    """Plain record holding the four sequences passed to the constructor.

    ``Distributions.compute_gaussians`` builds it from the estimate indices,
    the raw data chunks, the stacked mean/covariance array and the
    ``(start, stop)`` row ranges.  ``_index`` starts as ``None`` and is
    assigned by the caller afterwards.
    """

    def __init__(self, covindex, chunks, covlist, ilist):
        self._covindex, self._chunks = covindex, chunks
        self._covlist, self._ilist = covlist, ilist
        # Filled in after construction by the code that owns the date index.
        self._index = None
class Distributions:
    """Estimate one Gaussian (mean + covariance) per overlapping row window.

    ``load`` reads the data matrix from a CSV file; ``compute_gaussians``
    slides a window over its rows and fits a ``GraphicalLassoCV`` model to
    each window.
    """

    def __init__(self, varfact=1.1, eps=.2, window=90, overlap=60):
        """Store the estimation parameters.

        Args:
            varfact: factor applied to the diagonal of every fitted covariance.
            eps: stored as ``self.eps``; not used inside this class.
            window: default window length for ``compute_gaussians``.
            overlap: default number of rows shared by consecutive windows.
        """
        self._varfact = varfact
        self.eps = eps
        self._window = window
        self._overlap = overlap

    @staticmethod
    def _window_bounds(nobs, window, overlap):
        """Return the ``(start, stop)`` row range of every window over *nobs* rows.

        Raises:
            ValueError: if the window would not advance (``overlap >= window``)
                or has no rows (``window <= 0``).
        """
        if window <= 0 or overlap >= window:
            raise ValueError(
                'window must be positive and larger than overlap '
                '(window=%r, overlap=%r)' % (window, overlap))
        bounds = []
        start = 0
        while start < nobs:
            stop = min(start + window, nobs)
            bounds.append((start, stop))
            if stop >= nobs:
                break
            # The next window re-uses the last `overlap` rows of this one.
            start = stop - overlap
        return bounds

    def compute_gaussians(self, window=None, overlap=None):
        """Fit a sparse Gaussian to every window of the loaded data.

        Args:
            window: rows per window; defaults to the constructor's ``window``.
            overlap: rows shared by consecutive windows; defaults to the
                constructor's ``overlap``.

        Returns:
            DistList whose ``_covlist`` has shape ``(k, N, N + 1)``: for each
            of the ``k`` windows, column 0 is the mean vector and the other
            ``N`` columns are the covariance with its diagonal multiplied by
            ``varfact``.

        Raises:
            ValueError: if ``overlap >= window``, ``window <= 0`` or no data
                rows are loaded.
        """
        if window is None:
            window = self._window
        if overlap is None:
            overlap = self._overlap

        bounds = self._window_bounds(self._logret.shape[0], window, overlap)
        if not bounds:
            raise ValueError('no data rows loaded; nothing to estimate')

        pr = ProgressBar(len(bounds), prefix='Covariances Computed', suffix='Complete')
        chunks = []
        estimates = []
        for start, stop in bounds:
            chunk = self._logret[start:stop, :]
            chunks.append(chunk)
            pr.iterate()
            mu = chunk.mean(axis=0)
            model = GraphicalLassoCV(cv=3)
            model.fit(chunk.astype('float64'))
            cov = model.covariance_
            # Inflate the variances (diagonal) by the configured factor.
            np.fill_diagonal(cov, np.diag(cov) * self._varfact)
            estimates.append(np.hstack((mu.reshape(-1, 1), cov)))

        # Stack once at the end instead of re-copying with vstack each step.
        ret = np.stack(estimates)
        return DistList(np.arange(ret.shape[0]), chunks, ret, bounds)

    def load(self, mfile, mfields):
        """Read the comma-separated file *mfile*.

        Sets ``_index`` to the values of the ``Date`` column, ``_logret`` to
        the values of the *mfields* columns and ``_N`` to the number of
        selected columns.
        """
        frame = pandas.read_csv(mfile, sep=',')
        self._index = frame['Date'].values
        self._logret = frame[mfields].values
        self._N = self._logret.shape[1]
def main():
    """Compute the windowed Gaussian estimates and save them with dill.

    Everything is driven by 'regimes.ini': the source CSV, the columns to
    use, the window size and overlap, and the output file name.
    """
    config = ConfigReader('regimes.ini')
    source_csv = config.distrmastersource()
    columns = config.distrfields()

    engine = Distributions(varfact=1.1, eps=.2, window=90, overlap=60)
    engine.load(source_csv, columns)

    # The window parameters from the configuration are the ones actually used.
    win = config.distrws()
    lap = config.distroverlap()
    gaussians = engine.compute_gaussians(win, lap)
    gaussians._index = engine._index

    target = config.mastercovlist()
    print()
    print('list of covariances saved in..')
    with open(target, 'wb') as out:
        dill.dump(gaussians, out)
    print(target)
# Script entry point: run the covariance computation.
if __name__ == "__main__":
    main()
\ No newline at end of file
#!/usr/bin/python3 -O
import sys
from datetime import datetime
from distribs import DistList
from config_manager import ConfigReader
import dill
import os.path
from utils import ProgressBar
import numpy as np
from math import *
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from sklearn import cluster, covariance, manifold
class Clusters:
    """Cluster Gaussian estimates by their pairwise Hellinger distance.

    Call order used by the script section of this file:
    ``cal_affinity_matrix`` -> ``compute_clusters`` -> ``graph_clusters``.
    """

    def __init__(self, conf):
        """Load the list of estimates named by ``conf.mastercovlist()``.

        Args:
            conf: configuration object providing ``clusterN()``,
                ``mastercovlist()`` and the ``cl*`` accessors used below.
        """
        self._conf = conf
        self._NCLUSTERS = conf.clusterN()
        self._THRESHOLD = 1
        distfile = conf.mastercovlist()
        # NOTE(review): dill.load can execute arbitrary code contained in the
        # file; only load files produced by a trusted run.
        with open(distfile, 'rb') as f:
            self._distlist = dill.load(f)

    def spectral_centroids(self):
        """Pick one representative embedded point per cluster label.

        For each label the mean of its members in the 2-D embedding is taken
        and the embedded point closest to that mean (searched over all
        points, not only the members) is chosen.  ``self._centroids``
        receives tuples ``(coordinates, point index, member count,
        estimate at that index)``.
        """
        covs = self._distlist._covlist
        self._centroids = []
        for label in range(self._clabels.max() + 1):
            members = self._clabels == label
            mu = self._embedding[:, members].mean(axis=1).reshape(2, 1)
            # Squared Euclidean distance of every embedded point to the mean.
            dist = np.square(self._embedding - mu).sum(axis=0)
            nearest = np.argmin(dist)
            self._centroids.append((
                self._embedding[:, nearest],
                nearest,
                self._embedding[:, members].shape[1],
                covs[nearest],
            ))

    def graph_clusters(self):
        """Scatter-plot the embedding coloured by label and save it as 'clusters'."""
        t = 'clusters'
        embedding = self._embedding
        centroids = self._centroids
        L = self._clabels
        colormap = [cm.gnuplot2(i) for i in np.linspace(0., 1., max(L) + 2)]
        colorms = [colormap[i] for i in L]
        fig = plt.figure()
        plt.scatter(embedding[0, :], embedding[1, :], s=10, c=colorms, alpha=0.8)
        # Annotate every label at its centroid, slightly offset.
        for label in range(max(L) + 1):
            lxy = centroids[label][0]
            labelx, labely = lxy[0], lxy[1]
            textx = labelx + 0.004
            texty = labely + 0.01
            plt.annotate(str(label), xy=(labelx, labely), xytext=(textx, texty),
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), size=6)
        # Draw the centroids on top with a larger marker.
        for i, z in enumerate(centroids):
            plt.scatter(z[0][0], z[0][1], s=50, c=[colormap[i]], alpha=1.0)
        plt.title(t)
        plt.savefig('clusters')
        plt.close(fig)

    def hellinger(self, n, x, y, smooth, **kwargs):
        """Return a Hellinger-based distance between two Gaussians.

        Args:
            n: dimension of the Gaussians.
            x, y: arrays reshapeable to ``(n, n + 1)``; column 0 is the mean,
                the remaining columns are the covariance matrix.
            smooth: if true return ``atanh(H2)``, otherwise ``sqrt(H2)``,
                where ``H2`` is one minus the Bhattacharyya coefficient.
            **kwargs: accepted and ignored.

        Returns:
            float distance.  ``H2`` is clamped to ``[0, 1]`` (strictly below 1
            for the ``atanh`` form) so that rounding errors or an underflowing
            exponential cannot raise a math domain error.
        """
        x1 = x.reshape(n, n + 1)
        x2 = y.reshape(n, n + 1)
        mu1, sigma1 = x1[:, 0], x1[:, 1:]
        mu2, sigma2 = x2[:, 0], x2[:, 1:]
        # Log-determinants keep the determinant ratio numerically stable.
        sign1, ld1 = np.linalg.slogdet(sigma1)
        sign2, ld2 = np.linalg.slogdet(sigma2)
        dd = 0.5 * (sigma1 + sigma2)
        signdd, ldd = np.linalg.slogdet(dd)
        signt = (sign1 * sign2) / signdd
        if signt <= 0:
            print('negative')
        s_1 = np.linalg.inv(dd)
        diff = mu1 - mu2
        e = -0.125 * diff.T.dot(s_1).dot(diff)
        # det(s1)^(1/4) * det(s2)^(1/4) / det(dd)^(1/2), computed in log space.
        d = exp(.25 * (ld1 + ld2 - 2 * ldd))
        es = 1 - d * exp(e)
        # Clamp: rounding can push es slightly below 0 or up to exactly 1.
        es = min(max(es, 0.0), 1.0)
        if smooth:
            # atanh(1) is undefined; use the largest float below 1 instead.
            es = min(es, float(np.nextafter(1.0, 0.0)))
            r = atanh(es)
        else:
            r = sqrt(es)
        return r

    def compute_clusters(self):
        """Embed the distance matrix in 2-D, cluster it and persist the result.

        Writes the labels to 'clabels.plk' and the fitted clustering object
        to 'clusterengine.plk', then computes the centroids.
        """
        embed_nn = self._conf.clspectral_neighbors()
        node_position_model = manifold.SpectralEmbedding(
            n_components=2, n_neighbors=embed_nn, random_state=42)
        self._embedding = node_position_model.fit_transform(self._hellingerd.T).T
        X = self._embedding.astype('float64').T
        N = self._conf.clusterN()
        aff = self._conf.claffinity()
        nn = self._conf.cln_neighbors()
        rs = self._conf.clrandom_states()
        es = self._conf.cleigen_solver()
        self._agg = cluster.SpectralClustering(
            n_clusters=N, affinity=aff, eigen_solver=es,
            n_neighbors=nn, random_state=rs).fit(X)
        self._clabels = self._agg.labels_
        with open('clabels.plk', 'wb') as f:
            dill.dump(self._clabels, f)
        with open('clusterengine.plk', 'wb') as f:
            dill.dump(self._agg, f)
        self.spectral_centroids()

    def cal_affinity_matrix(self):
        """Fill ``self._hellingerd`` with all pairwise distances.

        If 'hellinger.plk' exists it is loaded and nothing is computed;
        otherwise the matrix is computed and saved to that file.
        """
        if os.path.isfile('hellinger.plk'):
            # Use a context manager so the cache file handle is closed.
            with open('hellinger.plk', 'rb') as f:
                self._hellingerd = dill.load(f)
            return
        cind = self._distlist._covindex
        covs = self._distlist._covlist
        hellingerd = np.full((covs.shape[0], covs.shape[0]), 0.)
        n = covs.shape[1]
        smooth = self._conf.smooth_hellinger()  # loop-invariant
        br = ProgressBar(covs.shape[0] * covs.shape[0],
                         prefix='Hellinger Affinities', suffix='Completed')
        for u in cind:
            for v in cind:
                br.iterate()
                if hellingerd[v, u] == 0:
                    hellingerd[u, v] = self.hellinger(n, covs[u], covs[v], smooth)
                else:
                    # Mirror entry already computed; re-use it.
                    hellingerd[u, v] = hellingerd[v, u]
        self._hellingerd = hellingerd
        with open('hellinger.plk', 'wb') as f:
            dill.dump(hellingerd, f)
if __name__ == "__main__":
    # Full pipeline: distances -> embedding and clustering -> plot.
    settings = ConfigReader('regimes.ini')
    engine = Clusters(settings)
    engine.cal_affinity_matrix()
    engine.compute_clusters()
    engine.graph_clusters()
\ No newline at end of file
[scav]
root=/home/eric/src/Data/axioma
datefrom=1982-01-01
dateto=1999-02-01
filetype=ret
fileformat=AXUS4-MH.
fileout=scavfactors.csv
listfields =United States,Value,Volatility,Aerospace & Defense
[distributions]
mastersource=RFreturns.csv
fields=ComEnOil,ComMeGol,CurEUR,CurJPY,EqCHN,EqEME,EqGrVaUSA,EqJPN,EqLaSmUSA,EqSeFinUSA,EqSeITeUSA,EqUSA,EqWLD,FICrHAAAUSA,FICrHBAAUSA,FICreHyUSA,FIG10YUSA,FIG3MUSA
windowsize=60
overlap=40
mastercovlist=mastercovlist.pkl
[clusters]
N=8
affinity=nearest_neighbors
eigen_solver=arpack
spectraln_neighbors=6
n_neighbors=10
random_states=48
smooth hellinger=True
\ No newline at end of file
#!/usr/bin/python3 -O
import glob
import os
from datetime import datetime
import numpy as np
import sys
import csv
from config_manager import ConfigReader
# Configuration shared by the functions below; 'regimes.ini' is read when
# this module is imported.
_creader = ConfigReader('regimes.ini')
# Directory in which create_csv() writes its output.  browse() overwrites it
# with os.getcwd() just before changing the working directory.
_currdir='.'
def browse():
    """Yield the relative paths of the matching files, month by month.

    Records the current directory in ``_currdir`` and then changes the
    working directory to the configured root (it is not changed back).  For
    every month from the configured start date up to, but excluding, the end
    date, the sub-directory ``YYYY/MM`` is scanned when it exists, and each
    file whose name starts with the configured format prefix and ends with
    '.' plus the configured file type is yielded.
    """
    global _currdir
    _currdir = os.getcwd()
    os.chdir(_creader.scavroot())

    wanted_suffix = '.' + _creader.scavsuffix()
    wanted_prefix = _creader.scavformat()
    first, last = _creader.dates()

    for month in np.arange(first, last, dtype='datetime64[M]'):
        month_dir = np.datetime_as_string(month, unit='M').replace('-', '/')
        if not os.path.exists(month_dir):
            continue
        for entry in os.listdir(month_dir):
            if entry.startswith(wanted_prefix) and entry.endswith(wanted_suffix):
                yield os.path.join(month_dir, entry)
def create_csv(header):
    """Open the output file for writing and emit the header line.

    The file is created in ``_currdir`` under the configured output name.
    The *header* items are joined with ', ' and written as the first line.
    The open file object is returned; closing it is up to the caller.
    """
    target = os.path.join(_currdir, _creader.scavout())
    handle = open(target, 'w')
    handle.write(', '.join(header) + '\n')
    handle.flush()
    return handle
def build_output():
    """Collect the configured fields from every input file into one CSV.

    Each file returned by ``browse()`` contributes one row: the date taken
    from its ``#DataDate:`` line followed by the value (second ``|``-separated
    column) of every line whose first column is in the configured field list.
    The header comes from the first file.  Rows are written sorted by date.

    Compared with the earlier version this also works when exactly one file
    is found, returns cleanly when none is found, and always closes the
    output file.

    Raises:
        ValueError: if a file yields a row whose length differs from the
            header built from the first file.
    """
    print('processing files ..')
    lfield = _creader.scavlistfield().split(',')
    rows = []
    ncols = None
    fcsv = None
    try:
        for i, f in enumerate(browse()):
            with open(f, 'r') as fd:
                row = []
                header = ['Date']
                print('\r%2d' % i, flush=True, end='')
                for line in fd:
                    if line.startswith('#'):
                        if line.startswith('#DataDate'):
                            asof = np.datetime64(line.split(':')[1].strip())
                            row = [asof]
                        continue
                    a = line.split('|')
                    # str.split never returns None or an empty list, so the
                    # filter reduces to a membership test.
                    if a[0] in lfield:
                        row.append(float(a[1]))
                        header.append(a[0])
            if fcsv is None:
                fcsv = create_csv(header)
                ncols = len(header)
            if len(row) != ncols:
                # Keep failing fast on ragged input instead of writing a
                # malformed CSV.
                raise ValueError('%s: expected %d columns, got %d'
                                 % (f, ncols, len(row)))
            rows.append(row)
        print('done')
        if fcsv is None:
            print('no input files found')
            return
        print('flushing')
        order = sorted(range(len(rows)), key=lambda k: rows[k][0])
        for i in order:
            print('\r%2d' % i, flush=True, end='')
            print(np.datetime_as_string(rows[i][0], unit='D'), ',',
                  ', '.join(map(str, rows[i][1:])), file=fcsv)
        print('done')
    finally:
        if fcsv is not None:
            fcsv.close()
# Script entry point: build the combined CSV.
if __name__ == "__main__":
    build_output()
#t,d,T = get_factors([1,2,3],datefrom='2007-01-31')
#print (T)
This diff is collapsed.
This diff is collapsed.
from time import sleep
class ProgressBar:
    """Single-line textual progress bar written to standard output."""

    def __init__(self, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
        """Configure the bar.

        Args:
            total: number of steps that corresponds to 100%.
            prefix: text printed before the bar.
            suffix: text printed after the percentage.
            decimals: digits after the decimal point of the percentage.
            length: width of the bar in characters.
            fill: character used for the completed part.
        """
        self._total = total
        self._iteration = 0
        self._prefix = prefix
        self._suffix = suffix
        self._decimals = decimals
        self._length = length
        self._fill = fill
        self._percent = 0
        # Ensures the terminating newline is printed exactly once.
        self._finished = False

    def _display(self):
        """Redraw the bar for the current iteration count."""
        if self._total > 0:
            fraction = self._iteration / float(self._total)
            filledLength = int(self._length * self._iteration // self._total)
        else:
            # Nothing to wait for: show a full bar instead of dividing by zero.
            fraction = 1.0
            filledLength = self._length
        # Keep the bar within its width even if iterate() is called too often.
        filledLength = min(max(filledLength, 0), self._length)
        self._percent = ("{0:." + str(self._decimals) + "f}").format(100 * fraction)
        bar = self._fill * filledLength + '-' * (self._length - filledLength)
        print('\r%s |%s| %s%% %s' % (self._prefix, bar, self._percent, self._suffix), end = '\r')
        # Print New Line on Complete; '>=' also covers non-integer totals.
        if not self._finished and self._iteration >= self._total:
            self._finished = True
            print()

    def iterate(self, inc=1):
        """Advance the bar by *inc* steps and redraw it."""
        sleep(0)
        self._iteration += inc
        self._display()
if __name__ == "__main__":
    # Demo: run a 50-step bar to completion.
    demo = ProgressBar(50, prefix='Progress', suffix='Complete', length=50)
    for _ in range(0, 50):
        demo.iterate()
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment