Commit d8be18d0 authored by Eric Dagobert

Initial commit

# config_manager.py
import configparser

import numpy as np


class ConfigReader:
    """Thin wrapper around regimes.ini exposing one accessor per setting."""

    def __init__(self, path):
        self._path = path
        self._config = configparser.ConfigParser()
        self._config.read(path)

    def scavroot(self):
        return self._config['scav']['root']

    def dates(self):
        x = np.datetime64(self._config['scav']['datefrom']).astype(object)
        y = np.datetime64(self._config['scav']['dateto']).astype(object)
        return x, y

    def scavsuffix(self):
        return self._config['scav']['filetype']

    def scavformat(self):
        return self._config['scav']['fileformat']

    def scavout(self):
        return self._config['scav']['fileout']

    def scavlistfield(self):
        return self._config['scav']['listfields']

    def distrmastersource(self):
        return self._config['distributions']['mastersource']

    def distrfields(self):
        return self._config['distributions']['fields'].split(',')

    def distrws(self):
        return int(self._config['distributions']['windowsize'])

    def distroverlap(self):
        return int(self._config['distributions']['overlap'])

    def mastercovlist(self):
        return self._config['distributions']['mastercovlist']

    def clusterN(self):
        return int(self._config['clusters']['N'])

    def claffinity(self):
        # affinity is a string such as 'nearest_neighbors', not an int
        return self._config['clusters']['affinity']

    def cleigen_solver(self):
        # eigen_solver is a string such as 'arpack', not an int
        return self._config['clusters']['eigen_solver']

    def clspectral_neighbors(self):
        # number of neighbors used by the spectral embedding in the clustering script
        return int(self._config['clusters']['spectraln_neighbors'])

    def cln_neighbors(self):
        return int(self._config['clusters']['n_neighbors'])

    def clrandom_states(self):
        return int(self._config['clusters']['random_states'])

    def smooth_hellinger(self):
        # bool('False') would be True, so parse the flag with getboolean
        return self._config['clusters'].getboolean('smooth hellinger')


if __name__ == "__main__":
    c = ConfigReader('regimes.ini')
    print(c.scavlistfield())
#!/usr/bin/python3 -O
# distribs.py: fits a Gaussian (mean + sparse covariance) on each rolling
# window of log-returns and pickles the resulting list
from utils import ProgressBar
from config_manager import ConfigReader
import pandas
import numpy as np
from sklearn.covariance import GraphicalLassoCV
import dill
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


class DistList:
    """Container for the rolling-window Gaussian estimates."""

    def __init__(self, covindex, chunks, covlist, ilist):
        self._covindex = covindex
        self._chunks = chunks
        self._covlist = covlist
        self._ilist = ilist
        self._index = None


class Distributions:
    def __init__(self, varfact=1.1, eps=.2, window=90, overlap=60):
        self._varfact = varfact
        self.eps = eps
        self._window = window
        self._overlap = overlap

    def compute_gaussians(self, window, overlap):
        """Fit a mean and GraphicalLassoCV covariance on each overlapping
        window of log-returns and stack the [mu | cov] blocks."""
        i = 0
        ret = None
        rw = []
        avdi = np.full((self._N,), 0.)
        ilist = []
        pr = ProgressBar(self._logret.shape[0] / (window - overlap),
                         prefix='Covariances Computed', suffix='Complete')
        while i < self._logret.shape[0]:
            iend = min(i + window, self._logret.shape[0])
            chunk = self._logret[i:iend, :]
            ilist.append((i, iend))
            rw.append(chunk)
            pr.iterate()
            mu = chunk.mean(axis=0)
            model = GraphicalLassoCV(cv=3)
            model.fit(chunk.astype('float64'))
            cov = model.covariance_
            di = np.diag(cov)
            avdi += di
            # inflate the diagonal to keep the matrix safely positive definite
            np.fill_diagonal(cov, di * self._varfact)
            # check spd
            # ch = np.linalg.cholesky(cov)
            r_ = np.hstack((mu.reshape(-1, 1), cov))
            if ret is None:
                ret = r_.reshape(1, r_.shape[0], r_.shape[1])
            else:
                ret = np.vstack((ret, r_.reshape(1, r_.shape[0], r_.shape[1])))
            if iend < self._logret.shape[0]:
                i = iend - overlap
            else:
                break
        dlist = DistList(np.arange(ret.shape[0]), rw, ret, ilist)
        return dlist

    def load(self, mfile, mfields):
        pd = pandas.read_csv(mfile, sep=',')
        self._index = pd['Date'].values
        self._logret = pd[mfields].values
        self._N = self._logret.shape[1]


def main():
    creader = ConfigReader('regimes.ini')
    mfile = creader.distrmastersource()
    mfields = creader.distrfields()
    D = Distributions(varfact=1.1, eps=.2, window=90, overlap=60)
    D.load(mfile, mfields)
    window = creader.distrws()
    overlap = creader.distroverlap()
    dlist = D.compute_gaussians(window, overlap)
    dlist._index = D._index
    mcovf = creader.mastercovlist()
    print()
    print('list of covariances saved in..')
    with open(mcovf, 'wb') as f:
        dill.dump(dlist, f)
    print(mcovf)


if __name__ == "__main__":
    main()
#!/usr/bin/python3 -O
# clustering script: builds a Hellinger affinity matrix over the rolling-window
# Gaussians produced by distribs.py, embeds it and runs spectral clustering
from distribs import DistList  # needed so dill can unpickle DistList objects
from config_manager import ConfigReader
import dill
import os.path
from utils import ProgressBar
import numpy as np
from math import exp, sqrt, atanh
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from sklearn import cluster, manifold


class Clusters:
    def __init__(self, conf):
        self._conf = conf
        self._NCLUSTERS = conf.clusterN()
        self._THRESHOLD = 1
        distfile = conf.mastercovlist()
        with open(distfile, 'rb') as f:
            self._distlist = dill.load(f)

    def spectral_centroids(self):
        self._centroids = []
        for i in range(self._clabels.max() + 1):
            idx = self._clabels == i
            mu = self._embedding[:, idx].mean(axis=1).reshape(2, 1)
            # find the embedded point closest to mu
            dist = np.square(self._embedding - mu).sum(axis=0)
            i2 = np.argmin(dist)
            covs = self._distlist._covlist
            self._centroids.append((self._embedding[:, i2], i2,
                                    self._embedding[:, idx].shape[1], covs[i2]))

    def graph_clusters(self):
        t = 'clusters'
        embedding = self._embedding
        centroids = self._centroids
        L = self._clabels
        colormap = [cm.gnuplot2(i) for i in np.linspace(0., 1., max(L) + 2)]
        colorms = [colormap[i] for i in L]
        fig = plt.figure()
        plt.scatter(embedding[0, :], embedding[1, :], s=10, c=colorms, alpha=0.8)
        for i, col in enumerate(range(max(L) + 1)):
            lxy = centroids[i][0]
            labelx, labely = lxy[0], lxy[1]
            textx = labelx + 0.004
            texty = labely + 0.01
            plt.annotate(str(col), xy=(labelx, labely), xytext=(textx, texty),
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
                         size=6)
        for i, z in enumerate(centroids):
            plt.scatter(z[0][0], z[0][1], s=50, c=[colormap[i]], alpha=1.0)
        plt.title(t)
        plt.savefig('clusters')
        plt.close(fig)
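
    # hellinger() below computes the Hellinger distance between two Gaussians
    # N(mu1, sigma1) and N(mu2, sigma2) packed as [mu | cov] blocks:
    #   H^2 = 1 - det(sigma1)^(1/4) * det(sigma2)^(1/4) / det((sigma1+sigma2)/2)^(1/2)
    #             * exp(-(1/8) * (mu1-mu2)^T * ((sigma1+sigma2)/2)^(-1) * (mu1-mu2))
    # Log-determinants (slogdet) are used for numerical stability; when the
    # 'smooth hellinger' option is set, atanh(H^2) is returned instead of H.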
    def hellinger(self, n, x, y, smooth, **kwargs):
        # unpack the stacked [mu | cov] blocks produced by distribs.py
        x1 = x.reshape(n, n + 1)
        x2 = y.reshape(n, n + 1)
        mu1, sigma1 = x1[:, 0], x1[:, 1:]
        mu2, sigma2 = x2[:, 0], x2[:, 1:]
        sign1, ld1 = np.linalg.slogdet(sigma1)
        sign2, ld2 = np.linalg.slogdet(sigma2)
        dd = 0.5 * (sigma1 + sigma2)
        signdd, ldd = np.linalg.slogdet(dd)
        signt = (sign1 * sign2) / signdd
        if signt <= 0:
            print('warning: non-positive determinant in Hellinger computation')
        s_1 = np.linalg.inv(dd)
        e = -0.125 * (mu1 - mu2).T.dot(s_1).dot(mu1 - mu2)
        d = .25 * (ld1 + ld2 - 2 * ldd)
        d = exp(d)
        es = 1 - d * exp(e)
        if smooth:
            r = atanh(es)
        else:
            r = sqrt(es)
        return r

    def compute_clusters(self):
        nn = self._conf.clspectral_neighbors()
        node_position_model = manifold.SpectralEmbedding(n_components=2,
                                                         n_neighbors=nn,
                                                         random_state=42)
        self._embedding = node_position_model.fit_transform(self._hellingerd.T).T
        X = self._embedding.astype('float64').T
        N = self._conf.clusterN()
        aff = self._conf.claffinity()
        nn = self._conf.cln_neighbors()
        rs = self._conf.clrandom_states()
        es = self._conf.cleigen_solver()
        self._agg = cluster.SpectralClustering(n_clusters=N, affinity=aff,
                                               eigen_solver=es, n_neighbors=nn,
                                               random_state=rs).fit(X)
        self._clabels = self._agg.labels_
        with open('clabels.plk', 'wb') as f:
            dill.dump(self._clabels, f)
        with open('clusterengine.plk', 'wb') as f:
            dill.dump(self._agg, f)
        self.spectral_centroids()

    def cal_affinity_matrix(self):
        # reuse a cached affinity matrix if one is already on disk
        if os.path.isfile('hellinger.plk'):
            self._hellingerd = dill.load(open('hellinger.plk', 'rb'))
            return
        cind = self._distlist._covindex
        covs = self._distlist._covlist
        hellingerd = np.full((covs.shape[0], covs.shape[0]), 0.)
        n = covs.shape[1]
        br = ProgressBar(covs.shape[0] * covs.shape[0],
                         prefix='Hellinger Affinities', suffix='Completed')
        for u in cind:
            for v in cind:
                br.iterate()
                if hellingerd[v, u] == 0:
                    hellingerd[u, v] = self.hellinger(n, covs[u], covs[v],
                                                      self._conf.smooth_hellinger())
                else:
                    # the distance is symmetric, so mirror the stored value
                    hellingerd[u, v] = hellingerd[v, u]
        self._hellingerd = hellingerd
        with open('hellinger.plk', 'wb') as f:
            dill.dump(hellingerd, f)


if __name__ == "__main__":
    conf = ConfigReader('regimes.ini')
    cl = Clusters(conf)
    cl.cal_affinity_matrix()
    cl.compute_clusters()
    cl.graph_clusters()
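As a rough illustration of how the pickled outputs above could be tied back to calendar dates, here is a minimal sketch, assuming distribs.py and the clustering script have already been run in the working directory; it relies only on the _index and _ilist attributes that distribs.py stores on DistList and on the clabels.plk file written by compute_clusters().

# illustrative snippet (hypothetical helper, not part of the commit)
import dill
from config_manager import ConfigReader
from distribs import DistList  # required so dill can unpickle DistList

conf = ConfigReader('regimes.ini')

# rolling-window Gaussians saved by distribs.py
with open(conf.mastercovlist(), 'rb') as f:
    dlist = dill.load(f)

# one cluster label per window, saved by compute_clusters()
with open('clabels.plk', 'rb') as f:
    labels = dill.load(f)

# map each window back to its date range using the stored row indices
for (start, end), label in zip(dlist._ilist, labels):
    print(dlist._index[start], '->', dlist._index[end - 1], 'regime', label)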
[scav]
root=/home/eric/src/Data/axioma
datefrom=1982-01-01
dateto=1999-02-01
filetype=ret
fileformat=AXUS4-MH.
fileout=scavfactors.csv
listfields =United States,Value,Volatility,Aerospace & Defense
[distributions]
mastersource=RFreturns.csv
fields=ComEnOil,ComMeGol,CurEUR,CurJPY,EqCHN,EqEME,EqGrVaUSA,EqJPN,EqLaSmUSA,EqSeFinUSA,EqSeITeUSA,EqUSA,EqWLD,FICrHAAAUSA,FICrHBAAUSA,FICreHyUSA,FIG10YUSA,FIG3MUSA
windowsize=60
overlap=40
mastercovlist=mastercovlist.pkl
[clusters]
N=8
affinity=nearest_neighbors
eigen_solver=arpack
spectraln_neighbors=6
n_neighbors=10
random_states=48
smooth hellinger=True
#!/usr/bin/python3 -O
# scavenger script: walks the monthly data tree configured in [scav] and
# flattens the selected factor fields into a single dated CSV
import os
import numpy as np
from config_manager import ConfigReader

_creader = ConfigReader('regimes.ini')
_currdir = '.'


def browse():
    """Yield the factor files between the configured dates, one month at a time."""
    global _creader, _currdir
    _currdir = os.getcwd()
    os.chdir(_creader.scavroot())
    suff = '.' + _creader.scavsuffix()
    pref = _creader.scavformat()
    f, t = _creader.dates()
    for dt in np.arange(f, t, dtype='datetime64[M]'):
        pdir = np.datetime_as_string(dt, unit='M').replace('-', '/')
        if os.path.exists(pdir):
            for file in os.listdir(pdir):
                if file.endswith(suff) and file.startswith(pref):
                    yield os.path.join(pdir, file)


def create_csv(header):
    ofile = os.path.join(_currdir, _creader.scavout())
    fcsv = open(ofile, 'w')
    print(', '.join(header), file=fcsv, flush=True)
    return fcsv


def build_output():
    first = True
    rows = None
    print('processing files ..')
    lfield = _creader.scavlistfield().split(',')
    for i, f in enumerate(browse()):
        with open(f, 'r') as fd:
            row = []
            header = ['Date']
            print('\r%2d' % i, flush=True, end='')
            for lines in fd.readlines():
                if lines.startswith('#'):
                    # the '#DataDate' header carries the as-of date of the file
                    if lines.startswith('#DataDate'):
                        asof = np.datetime64(lines.split(':')[1].strip())
                        row = [asof]
                    continue
                a = lines.split('|')
                if lfield and a[0] in lfield:
                    row.append(float(a[1]))
                    header.append(a[0])
            if first:
                fcsv = create_csv(header)
                first = False
            if rows is None:
                rows = np.array(row)
            else:
                rows = np.vstack((rows, row))
    rows = rows.T
    dsort = np.argsort(rows[0, :])
    print('done')
    print('flushing')
    # write the rows out in date order
    for ii, i in enumerate(dsort):
        print('\r%2d' % i, flush=True, end='')
        print(np.datetime_as_string(rows[0, i], unit='D'), ',',
              ', '.join(map(str, rows[1:, i])), file=fcsv)
    print('done')
    fcsv.close()


if __name__ == "__main__":
    build_output()
    # t,d,T = get_factors([1,2,3],datefrom='2007-01-31')
    # print (T)
# utils.py
class ProgressBar:
    """Minimal console progress bar."""

    def __init__(self, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
        self._total = total
        self._iteration = 0
        self._prefix = prefix
        self._suffix = suffix
        self._decimals = decimals
        self._length = length
        self._fill = fill
        self._percent = 0

    def _display(self):
        self._percent = ("{0:." + str(self._decimals) + "f}").format(
            100 * (self._iteration / float(self._total)))
        filledLength = int(self._length * self._iteration // self._total)
        bar = self._fill * filledLength + '-' * (self._length - filledLength)
        print('\r%s |%s| %s%% %s' % (self._prefix, bar, self._percent, self._suffix), end='\r')
        # print a new line on completion
        if self._iteration == self._total:
            print()

    def iterate(self, inc=1):
        self._iteration += inc
        self._display()


if __name__ == "__main__":
    pr = ProgressBar(50, prefix='Progress', suffix='Complete', length=50)
    for i in range(0, 50):
        pr.iterate()