import sys
import os
import glob
import time
import numpy as num
import argparse
import pprint
import logging
import itertools
from pyrocko import util, model, orthodrome
from pyrocko import cake
from pyrocko.io import stationxml, quakeml
from pyrocko.dataset import crust2x2
from pyrocko.gui import marker
from shutil import copyfile
from . import cc, merge_freqs
from . import clustering as cl
from .config import GeneralSettings, cc_comp_settings, clusty_config,\
    network_similarity_settings, clustering_settings
from .config_settings_default import generate_default_config
import matplotlib.pyplot as plt

np = num
ver = 'GM_04052021'


def station_selector(stations, catalog, **kwargs):
    '''
    Small function to select a subset of stations based on
    e.g. maximum distance or other metadata information
    (for example lower corner frequency or sensor
    type could be implemented).
    Station allowlists and blocklists are implemented directly in
    main().

    :param stations: list of pyrocko stations
    :param catalog: list of pyrocko events
    '''
    # fall back to the full station list if no distance bounds are given,
    # so the dump below cannot fail with a NameError
    subset_stations = list(stations)
    if 'mindist_stations' in kwargs and 'maxdist_stations' in kwargs:
        mid_events = orthodrome.geographic_midpoint(
            lats=num.asarray([ev.lat for ev in catalog]),
            lons=num.asarray([ev.lon for ev in catalog]),
            weights=None)
        subset_stations = []
        for st in stations:
            inter_sta_dist = orthodrome.distance_accurate50m(
                st.lat, st.lon, mid_events[0], mid_events[1])/1000.
            if kwargs['mindist_stations'] < inter_sta_dist \
                    < kwargs['maxdist_stations']:
                subset_stations.append(st)
    elif 'maxdist_stations' in kwargs and 'mindist_stations' not in kwargs:
        mid_events = orthodrome.geographic_midpoint(
            lats=num.asarray([ev.lat for ev in catalog]),
            lons=num.asarray([ev.lon for ev in catalog]),
            weights=None)
        subset_stations = [
            st for st in stations if orthodrome.distance_accurate50m(
                st.lat, st.lon, mid_events[0], mid_events[1])/1000.
            < kwargs['maxdist_stations']]
    model.dump_stations(subset_stations, filename='used_stations.pf')
    return subset_stations
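

# Illustrative use of station_selector; the keyword arguments come from the
# `station_subset` entry of the config file, values here are hypothetical:
#
#     subset = station_selector(stations, catalog,
#                               mindist_stations=5., maxdist_stations=120.)
#
# Distances are epicentral distances in km between each station and the
# geographic midpoint of the event catalog.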


def get_ncomp(ns_cnf, cc_cnf):
    if ns_cnf.method_similarity_computation == 'product_combPS' \
            or ns_cnf.combine_components:
        n_comps = 1
    else:
        n_comps = len(cc_cnf.components)
    return n_comps
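

# Note: when component results are combined into a single network similarity
# ('product_combPS' or `combine_components`), the downstream loops run once
# with the component name 'comb' instead of once per component (see the
# `n_comps == 1 and ns_cnf.combine_components` branches in main()).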


# def get_phasename(cc_cnf, ns_cnf, i_ph):
#     n_ph = get_nph(ns_cnf)
#     if len(cc_cnf.phase) == 2 and n_ph == 1:
#         phase_name = 'PScomb'
#     elif len(cc_cnf.phase) == 1 and n_ph == 1:
#         phase_name = cc_cnf.phase[0]
#     elif len(cc_cnf.phase) == 2 and n_ph == 2:
#         phase_name = cc_cnf.phase[i_ph]
#     return phase_name


def comp_cc(comp, station_list, cat, cc_cnf, waveform_dir,
            arrivals_array, bool_dist, n_workers):
    cc.get_similarity_matrix(comp, station_list, cat, cc_cnf, waveform_dir,
                             arrivals_array, bool_dist, n_workers)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--init',
                        help='Generate example config file', required=False,
                        action='store_true')
    parser.add_argument('--config',
                        help='Name of config file', required=False)
    parser.add_argument('--run',
                        help='Start complete clusty run',
                        required=False, action='store_true')
    parser.add_argument('--cc',
                        help='Run clusty to get cross-correlation matrix, no' +
                             ' clustering started',
                        required=False, action='store_true')
    parser.add_argument('--netsim',
                        help='Compute network similarity from cross-' +
                             'correlation matrix',
                        required=False, action='store_true')
    parser.add_argument('--cluster',
                        help='Run clusty for clustering, network similarity' +
                             ' matrix needed as input',
                        required=False, action='store_true')
    parser.add_argument('--plot_results',
                        help='Plot clusty results from result files',
                        required=False, action='store_true')
    parser.add_argument('--eps_analysis',
                        help='Run cluster analysis to select eps',
                        required=False, action='store_true')
    parser.add_argument('--merge_freq_results',
                        help='Harmonize and merge catalogs of different' +
                             ' frequency bands',
                        required=False, action='store_true')
    parser.add_argument('--export_stacked_traces',
                        help='Export traces stacked by cluster index',
                        required=False, action='store_true')
    parser.add_argument('--log_level', default=logging.INFO,
                        type=lambda x: getattr(logging, x),
                        help='Configure the logging level')
    # add generate_config for example config later
    args = parser.parse_args()
    logger = logging.getLogger(__name__)
    logger.setLevel(args.log_level)
    # create console handler and set level
    ch = logging.StreamHandler()
    ch.setLevel(args.log_level)
    logger.addHandler(ch)
    logger.info('Welcome to clusty! (%s)' % ver)

    if args.init:
        logger.info('Preparing basic config file.')
        fn = 'config_clusty.yaml'
        conf = generate_default_config()
        conf.dump(filename=fn)
        logger.info('Config file name: %s' % fn)

    if not args.init and not args.config:
        logger.error('Clusty needs a config file.')
        parser.print_help()
    if args.config and not args.merge_freq_results:
        try:
            basic_cnf, cc_cnf, ns_cnf, cl_cnf = clusty_config.load(
                filename=args.config).settings
        except ValueError:
            # configs prepared for frequency merging carry a fifth settings group
            basic_cnf, cc_cnf, ns_cnf, cl_cnf, mf_cnf = clusty_config.load(
                filename=args.config).settings
        logger.info(basic_cnf)
        logger.info(cc_cnf)
        os.makedirs('%s/results' % basic_cnf.work_dir, exist_ok=True)
        copyfile(args.config, '%s/results/config_copy.yaml' % basic_cnf.work_dir)
        os.makedirs('%s/precalc_arrays' % basic_cnf.work_dir, exist_ok=True)

        # get event catalog
        try:
            cat = model.load_events(filename=basic_cnf.catalog_file)
            cat.sort(key=lambda ev: ev.time)
            for ev in cat:
                if ev.name == '' or ev.name == ' ':
                    ev.name = util.time_to_str(ev.time).replace(' ', '_')
            filename = os.path.split(basic_cnf.catalog_file)[-1].split('.')[:-1][0]
            model.dump_events(cat, filename='%s/results/%s.yaml'
                              % (basic_cnf.work_dir, filename), format='yaml')
        except Exception:
            logger.warning('Cannot load event catalog. Aborting clusty run.')
            sys.exit()
        # load stations
        station_list = []
        if basic_cnf.station_file:
            if not basic_cnf.station_file.endswith('xml'):
                station_list = model.station.load_stations(basic_cnf.station_file)
            else:
                station_list = stationxml.load_xml(
                    filename=basic_cnf.station_file).get_pyrocko_stations()
        elif basic_cnf.station_file_list:
            for file in basic_cnf.station_file_list:
                if file.endswith('.xml'):
                    station_list.extend(stationxml.load_xml(
                        filename=file).get_pyrocko_stations())
                elif file.endswith('.yaml') or file.endswith('.pf'):
                    station_list.extend(model.station.load_stations(file))
                else:
                    pass
        if station_list == []:
            logger.warning("No stations found. Please use stationxml with"
                           " file extension '.xml' or pyrocko station format"
                           " with file extension '.pf' or '.yaml'.")

        if basic_cnf.st_allowlist != []:
            logger.info('st allow list: %s' % basic_cnf.st_allowlist)
            station_list = [st for st in station_list if '%s.%s' %
                            (st.network, st.station)
                            in basic_cnf.st_allowlist]
        if basic_cnf.st_blocklist != []:
            station_list = [st for st in station_list if not '%s.%s' %
                            (st.network, st.station)
                            in basic_cnf.st_blocklist]
        if basic_cnf.station_subset:
            kwargs = basic_cnf.station_subset
            station_list = station_selector(station_list, cat, **kwargs)

        logger.info('Number of events: %s' % len(cat))
        logger.info('Number of stations: %s %s'
                    % (len(station_list), [st.station for st in station_list]))
        if station_list == []:
            logger.warning('No stations found.')
            sys.exit()
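
        # Allow/block lists match station identifiers of the form 'NET.STA'
        # (network and station code joined by a dot), e.g. in the config
        # (hypothetical codes):
        #
        #     st_allowlist: ['GR.GEC2', 'BW.ROTZ']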
    if args.run or args.cc:
        '''
        start complete clusty run or cc-computation with cross-correlation
        matrix computation
        '''
        if not cc_cnf.use_precalc_arrivals:
            arrivals_array = num.empty((len(cc_cnf.phase),
                                        len(station_list),
                                        len(cat)))
            arrivals_array.fill(num.nan)
            logger.info('Computing arrival times.')
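            # arrivals_array layout: axis 0 = phase (order of cc_cnf.phase),
            # axis 1 = station (order of station_list), axis 2 = event
            # (time-sorted catalog); entries are absolute arrival times in s,
            # NaN where no arrival is available.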
            phases = cc_cnf.phase
            if cc_cnf.compute_arrivals:
                st_lats = num.asarray([st.lat for st in station_list])
                st_lons = num.asarray([st.lon for st in station_list])
                if 'P' in phases or 'p' in phases or 'S' in phases or 's' in phases:
                    if cc_cnf.vmodel == '':
                        vmodels = [cake.load_model(
                            crust2_profile=crust2x2.get_profile(st.lat, st.lon))
                            for st in station_list]
                    else:
                        vmodel = cake.load_model(cc_cnf.vmodel)
                    for i_ev, ev in enumerate(cat):
                        ev_sta_dists = orthodrome.distance_accurate50m_numpy(
                            ev.lat, ev.lon, st_lats, st_lons)
                        for i_st, st in enumerate(station_list):
                            ev_sta_dist = ev_sta_dists[i_st]
                            if cc_cnf.vmodel == '':
                                vmodel = vmodels[i_st]
                            for i_ph, phase_name in enumerate(phases):
                                if phase_name == 'P' or phase_name == 'p':
                                    phase_names = ['p', 'P']
                                elif phase_name == 'S' or phase_name == 's':
                                    phase_names = ['s', 'S']
                                # elif phase_name == 'R' or phase_name == 'L':
                                #     phase_names = ['p', 'P']
                                # surface waves have such long periods that
                                # the exact onset does not matter.
                                # only true for regional settings, therefore
                                # new arrival computation added below!
                                arrivals = vmodel.arrivals(
                                    distances=[ev_sta_dist*cake.m2d],
                                    phases=phase_names,
                                    zstart=ev.depth)
                                if arrivals:
                                    min_t = min(arrivals, key=lambda x: x.t).t
                                    arrivals_array[i_ph, i_st, i_ev] = \
                                        ev.time + min_t
                                else:
                                    arrivals_array[i_ph, i_st, i_ev] = num.nan
                elif 'R' in phases or 'r' in phases or 'L' in phases or 'l' in phases:
                    v_surface = 4500  # m/s, constant surface-wave group velocity
                    for i_ev, ev in enumerate(cat):
                        ev_sta_dists = orthodrome.distance_accurate50m_numpy(
                            ev.lat, ev.lon, st_lats, st_lons)
                        for i_st, st in enumerate(station_list):
                            ev_sta_dist = ev_sta_dists[i_st]
                            for i_ph, ph in enumerate(phases):
                                arr = ev.time + (ev_sta_dist/v_surface)
                                arrivals_array[i_ph, i_st, i_ev] = arr
                else:
                    logger.warning(
                        'Phase name not defined, please use phase R, L, P or S.')
                num.save(file='./precalc_arrays/arrivals_array',
                         arr=arrivals_array)
            elif cc_cnf.pick_path != '' and cc_cnf.pick_option == 'pyr_path':
                for i_ph, phase_name in enumerate(cc_cnf.phase):
                    for i_st, st in enumerate(station_list):
                        for i_ev, ev in enumerate(cat):
                            try:
                                picks = marker.load_markers(
                                    '%s/%s.pf' % (cc_cnf.pick_path, ev.name))
                            except FileNotFoundError:
                                # no pick file for this event; mark as missing
                                arrivals_array[i_ph, i_st, i_ev] = num.nan
                                continue
                            # reject markers without phase name
                            picks = [pik for pik in picks
                                     if type(pik) == marker.PhaseMarker]
                            try:
                                arr = [pick.tmin for pick in picks if
                                       (pick.match_nsl((st.network, st.station,
                                                        st.location)))][0]
                                # and pick._phasename == phase_name)][0]
                                arrivals_array[i_ph, i_st, i_ev] = arr
                            except IndexError:  # if the given phase has no pick
                                arrivals_array[i_ph, i_st, i_ev] = num.nan
                num.save(file='%s/precalc_arrays/arrivals_array'
                         % basic_cnf.work_dir, arr=arrivals_array)
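
            # The 'pyr_path' option expects one pyrocko marker file per event,
            # named '<event_name>.pf' inside cc_cnf.pick_path; the 'pyr_file'
            # option below expects a single marker file holding picks for all
            # events, matched via the pick's phase name and event time.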
            elif cc_cnf.pick_path != '' and cc_cnf.pick_option == 'pyr_file':
                picks = marker.load_markers(cc_cnf.pick_path)
                for i_ph, phase_name in enumerate(cc_cnf.phase):
                    for i_st, st in enumerate(station_list):
                        for i_ev, ev in enumerate(cat):
                            try:
                                arr = [pick.tmin for pick in picks if
                                       (pick.match_nsl((st.network, st.station,
                                                        st.location))
                                        and pick._phasename == phase_name
                                        and pick._event_time == ev.time)][0]
                                arrivals_array[i_ph, i_st, i_ev] = arr
                            except IndexError:  # if the given phase has no pick
                                arrivals_array[i_ph, i_st, i_ev] = num.nan
                num.save(file='%s/precalc_arrays/arrivals_array'
                         % basic_cnf.work_dir, arr=arrivals_array)
            elif cc_cnf.pick_path != '' and cc_cnf.pick_option == 'xml':
                logger.info('Generating arrivals array from picks in xml format.')
                search_me = os.path.join(cc_cnf.pick_path, '*.xml')
                pick_files = glob.glob(search_me)  # find all files
                logger.info('found pick files: %s' % pick_files)
                # lookup tables for sorting picks into the arrival array
                # (has potential to be improved...)
                event_iev_dict = {}
                for iev, ev in enumerate(cat):
                    event_iev_dict[ev.name] = iev
                nst_ist_dict = {}
                st_ist_dict = {}
                for ist, st in enumerate(station_list):
                    nst_ist_dict['%s.%s' % (st.network, st.station)] = ist
                    st_ist_dict[st.station] = ist
                ph_iph_dict = {}
                for iph, ph in enumerate(cc_cnf.phase):
                    ph_iph_dict[ph] = iph
                # logger.info('dicts st: %s, ev: %s, ph: %s'
                #             % (len(st_ist_dict.keys()),
                #                len(event_iev_dict.keys()),
                #                len(ph_iph_dict.keys())))
                for f in pick_files:
                    try:
                        qml = quakeml.QuakeML.load_xml(filename=f)
                    except Exception:
                        logger.debug('could not load file: %s' % f)
                        continue
                    for ev in qml.event_parameters.event_list:
                        evname = ev.public_id.split('/')[1]
                        logger.debug('event. %s' % evname)
                        try:
                            iev = event_iev_dict[evname]
                            logger.debug('%s %s' % (evname, cat[iev].name))
                        except KeyError:
                            logger.warning('Event not found. %s' % evname)
                            continue
                        for pick in ev.pick_list:
                            ph = pick.phase_hint.value
                            if ph == 'AML':
                                continue
                            try:
                                iph = ph_iph_dict[ph]
                                logger.debug('%s %s' % (ph, cc_cnf.phase[iph]))
                            except KeyError:
                                logger.warning('Phase not found. %s' % ph)
                                continue
                            stcode = pick.waveform_id.station_code
                            netcode = pick.waveform_id.network_code
                            if len(netcode) < 1:
                                logger.debug('No network code for station %s'
                                             % stcode)
                                try:
                                    ist = st_ist_dict[stcode]
                                    logger.debug('%s %s' % (
                                        stcode, station_list[ist].station))
                                except KeyError:
                                    logger.info('Station not found. %s' % stcode)
                                    continue
                            else:
                                try:
                                    ist = nst_ist_dict['%s.%s' % (netcode, stcode)]
                                    logger.debug('%s %s' % (
                                        stcode, station_list[ist].station))
                                except KeyError:
                                    logger.info('Station not found. %s.%s'
                                                % (netcode, stcode))
                                    continue
                            picktime = pick.time.value
                            logger.debug('pick time: %s' % picktime)
                            arrivals_array[iph, ist, iev] = picktime
                            # if stcode == 'ITM' or stcode == 'FSK' or stcode == 'KALE':
                            #     logger.info('%s: %s %s %s' % (stcode, ph, cat[iev].name, picktime))
                            #     logger.info('%s %s %s %s %s' % (iph, ist, iev, picktime, arrivals_array[iph, ist, iev]))
                            # if logger.getLogger().isEnabledFor(logger.DEBUG):
                            #     print('pick arr: ', ph, stcode, cat[iev].name, util.tts(picktime))
                            #     vmodel = cake.load_model(crust2_profile=crust2x2.get_profile(st.lat, st.lon))
                            #     if ph == 'P' or ph == 'p':
                            #         phase_names__ = ['p', 'P']
                            #     elif ph == 'S' or ph == 's':
                            #         phase_names__ = ['s', 'S']
                            #     print(ph, phase_names__)
                            #     ev_sta_dist = orthodrome.distance_accurate50m_numpy(
                            #         cat[iev].lat, cat[iev].lon, station_list[ist].lat, station_list[ist].lon)
                            #     caki = vmodel.arrivals(distances=[ev_sta_dist[0]*cake.m2d],
                            #                            phases=phase_names__,
                            #                            zstart=cat[iev].depth)
                            #     for c in caki:
                            #         print(c)
                            #     if caki:
                            #         min_t = min(caki, key=lambda x: x.t).t
                            #         print('cake arr:', util.tts(cat[iev].time + min_t))
                            #     else:
                            #         print('no cake')
                            #     print('-----------------------------')
                # num.set_printoptions(suppress=True,
                #                      formatter={'float_kind': '{:16.3f}'.format},
                #                      linewidth=130)
                # pprint.pprint(arrivals_array)
                num.save(file='%s/precalc_arrays/arrivals_array'
                         % basic_cnf.work_dir, arr=arrivals_array)
                logger.debug('arrival-array size: %s' % str(arrivals_array.shape))
                logger.debug('min arr time: %s' % str(num.nanmin(arrivals_array)))
                logger.debug('max arr time: %s' % str(num.nanmax(arrivals_array)))
            else:
                logger.error('Please choose a pick file or use cake computation.')
                sys.exit()
        else:
            arrivals_array = num.load('./precalc_arrays/arrivals_array.npy')
            if arrivals_array.shape[1] != len(station_list):
                logger.error('Arrival time array dimensions do not match '
                             'the number of stations.')
                sys.exit()
        logger.info('Arrival times successfully calculated/loaded.')
        ### cross-correlations ###
        if not cc_cnf.use_precalc_ccs:
            bool_dist, ev_ev_dist_array = cc.check_distance(
                cat, cc_cnf.max_dist, cc_cnf.dist_hypocentral)
            num.save(arr=ev_ev_dist_array,
                     file='./precalc_arrays/ev_ev_dist_array')
            for i_comp, comp in enumerate(cc_cnf.components):
                comp_cc(comp, station_list, cat, cc_cnf, basic_cnf.waveform_dir,
                        arrivals_array, bool_dist, basic_cnf.n_workers)
                # cc.get_similarity_matrix(comp, station_list, cat, cc_cnf,
                #                          basic_cnf.waveform_dir,
                #                          arrivals_array, bool_dist,
                #                          basic_cnf.n_workers)
                # coef_array, tshifts_array, weights_array = cc.get_similarity_matrix(
                #     comp, station_list, cat, cc_cnf, basic_cnf.waveform_dir,
                #     arrivals_array, bool_dist, basic_cnf.n_workers)
                # print(sys.getsizeof(coef_array))
                # print(sys.getsizeof(tshifts_array))
                # print(sys.getsizeof(weights_array))
                # num.save(arr=coef_array, file='./precalc_arrays/cccoef_array_%s' % comp_base)
                # num.save(arr=weights_array, file='./precalc_arrays/ccweights_array_%s' % comp_base)
                # num.save(arr=tshifts_array, file='./precalc_arrays/cctshifts_array_%s' % comp_base)
                # del coef_array
                # del tshifts_array
                # del weights_array
                # if cc_cnf.plot_npairs_stats:
                #     n_pairs = [num.where(coef_array[0, i] > 0.7).shape[0]
                #                for i in range(len(station_list))]
                #     fig, ax = plt.subplots(nrows=2)
                #     ax[0].plot(npairs)
                #     ax[0].set_ylabel('n_pairs')
                #     ax[0].set_xlabel('station index')
                #     plt.savefig('./results/npairs_stats.png')
                #     plt.close()
        else:
            for i_comp, comp in enumerate(cc_cnf.components):
                comp_base = comp[-1]
                logger.info('Using precalculated arrays.')
                logger.debug('Found cccoef: %s' % os.path.exists(
                    './precalc_arrays/cccoef_array_%s.npy' % comp_base))
                logger.debug('Found ccweights: %s' % os.path.exists(
                    './precalc_arrays/ccweights_array_%s.npy' % comp_base))
                logger.debug('Found cctshifts: %s' % os.path.exists(
                    './precalc_arrays/cctshifts_array_%s.npy' % comp_base))
                logger.debug('Found ev_ev_dists: %s' % os.path.exists(
                    './precalc_arrays/ev_ev_dist_array.npy'))
        # if cc_cnf.plot_cc_dist:
        #     for i_ph, phase_name in enumerate(cc_cnf.phase):
        #         cc.plot_cc_over_dist(coef_array[i_ph], ev_sta_dist_array,
        #                              phase_name, cc_cnf.max_dist)
        logger.info('Cross-correlation section ready.')
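
        # The cc step persists its results as numpy arrays under
        # ./precalc_arrays/: cccoef_array_<comp>.npy (cross-correlation
        # coefficients), ccweights_array_<comp>.npy, cctshifts_array_<comp>.npy
        # and ev_ev_dist_array.npy; the later stages (--netsim, --cluster)
        # load these instead of recomputing.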
    if args.run or args.netsim:
        '''
        Run network similarity computation from cross-correlation matrix,
        either as part of an entire clusty run or of a netsim-only run.
        '''
        # optional: compute station weights based on azimuthal distribution
        az_stats_weights = None
        if ns_cnf.get_station_weights:
            logger.info('Starting interactive station weighting selection.')
            az_stats_weights = cc.station_weighting(station_list, cat)
            if not az_stats_weights == []:
                num.save(arr=az_stats_weights, file='az_stats_weights')

        n_comp = len(cc_cnf.components)
        n_ev = len(cat)
        network_similarity = num.empty((n_comp, n_ev, n_ev))
        n_used_stats = num.empty((n_comp, n_ev, n_ev))
        # logger.info('NETSIM allocation:::%.2f' % time.time())
        for i_comp, comp in enumerate(cc_cnf.components):
            comp_base = comp[-1]
            if not ns_cnf.use_precalc_net_sim:
                logger.info('Loading cc-arrays.')
                coef_array = num.load(
                    './precalc_arrays/cccoef_array_%s.npy' % comp_base)
                if coef_array.shape[0] != len(station_list):
                    logger.error('CC coef. array dimensions do not match '
                                 'the number of stations.')
                    sys.exit()
                weights_array = num.load(
                    './precalc_arrays/ccweights_array_%s.npy' % comp_base)
                tshifts_array = num.load(
                    './precalc_arrays/cctshifts_array_%s.npy' % comp_base)
                # ev_ev_dist_array = num.load('./precalc_arrays/ev_ev_dist_array.npy')
                logger.info('Starting computation of network similarity.')
                n_ph = len(cc_cnf.phase)
                if not ns_cnf.method_similarity_computation == 'trimmed_mean_ccs':
                    network_similarity[i_comp] = cc.stack_ccs_over_stations(
                        coef_array, weights_array, az_stats_weights, ns_cnf,
                        station_list, cat)
                else:
                    if ns_cnf.get_bool_sta_use:
                        network_similarity[i_comp], n_used_stats[i_comp], sta_use = \
                            cc.stack_ccs_over_stations(
                                coef_array, weights_array, az_stats_weights,
                                ns_cnf, station_list, cat)
                        num.save(arr=sta_use,
                                 file='./precalc_arrays/sta_use_%s.npy' % comp_base)
                    else:
                        network_similarity[i_comp], n_used_stats[i_comp] = \
                            cc.stack_ccs_over_stations(
                                coef_array, weights_array, az_stats_weights,
                                ns_cnf, station_list, cat)
                    num.save(arr=n_used_stats,
                             file='./precalc_arrays/n_used_stats.npy')
                num.save(arr=network_similarity,
                         file='./precalc_arrays/network_similarity.npy')
            else:
                network_similarity = num.load(
                    './precalc_arrays/network_similarity.npy')
                if ns_cnf.method_similarity_computation == 'trimmed_mean_ccs':
                    n_used_stats = num.load('./precalc_arrays/n_used_stats.npy')
            logger.info('Netsim for %s component finished.' % comp)
            title = 'Net-sim \n(%s)' % ns_cnf.method_similarity_computation
            # imshow plots might have some issues with matrix dimensions ;-)
            # fig_name = ns_cnf.method_similarity_computation + phase_name
            # cc.small_imshow_plot(network_similarity[i_ph], cat, title,
            #                      cbarlabel=None, fig_name=fig_name)
            # if ns_cnf.method_similarity_computation == 'trimmed_mean_ccs':
            #     fig_name = './results/used_stats_%s_%s.png' % (
            #         ns_cnf.method_similarity_computation, phase_name)
            #     title_s = 'n_st in net-sim comp.'
            #     cc.small_imshow_plot(n_used_stats[i_ph], cat, title_s,
            #                          cbarlabel=None, fig_name=fig_name,
            #                          scale=(0, num.max(n_used_stats[i_ph])))
            if not ns_cnf.combine_components:
                logger.debug('Computing distance matrix.')
                cl.get_distance_matrix(network_similarity[i_comp], comp)

        if ns_cnf.combine_components:
            logger.debug('Combining P and S network similarity results.')
            network_similarity = cc.comb_avg(network_similarity,
                                             ns_cnf.weights_components)
            logger.debug('Computing distance matrix.')
            name = 'comb'
            cl.get_distance_matrix(network_similarity[0], name)
        logger.info('Network similarity and distance matrix ready.')
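
        # cl.get_distance_matrix converts the network similarity matrix into
        # the distance matrix consumed by DBSCAN and stores it as
        # ./precalc_arrays/dists_<comp>.npy (presumably distance =
        # 1 - similarity; see clustering.get_distance_matrix for details).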
    if args.eps_analysis:
        '''
        input: dists, min_pts
        --> knn-plot for eps setting, using the min_pts values from the
            config (cl_cnf.dbscan_min_pts)
        --> one subplot per eps value, figure size variable depending on
            len(eps_list)
        '''
        n_comps = get_ncomp(ns_cnf, cc_cnf)
        for i_comp in range(n_comps):
            if n_comps == 1 and ns_cnf.combine_components:
                comp_name = 'comb'
            else:
                comp_name = cc_cnf.components[i_comp]
            dist_file = './precalc_arrays/dists_%s.npy' % comp_name
            dists = num.load(dist_file)
            num.fill_diagonal(dists, 0.0)
            cl.knn_plot(dists, cl_cnf.dbscan_min_pts, comp_name)
            for minpts in list(set(cl_cnf.dbscan_min_pts)):
                eps_minpts_tuple_list = [(eps, minpts)
                                         for eps in num.arange(0.01, 0.41, 0.01)]
                if cl_cnf.method == 'dbscan':
                    cluster_labels_list = cl.cluster_dbscan(
                        eps_minpts_tuples=eps_minpts_tuple_list,
                        dist_array=dists)
                else:
                    logger.error("Clustering method not implemented. "
                                 "Please use 'dbscan'.")
                eps_sil_list = []
                sil_score_list = []
                sil_med_list = []
                n_cluster_list = []
                n_clustered_list = []
                for i_cl, cluster_labels in enumerate(cluster_labels_list):
                    try:
                        silhouette_vals, silhouette_scr, silhouette_medi = \
                            cl.calc_silhouette(cluster_labels, dists)
                        n_cluster = len(set(cluster_labels)) - 1
                        n_clustered = sum(num.array(cluster_labels) != -1)
                        sil_score_list.append(silhouette_scr)
                        sil_med_list.append(silhouette_medi)
                        n_cluster_list.append(n_cluster)
                        n_clustered_list.append(n_clustered)
                    except ValueError:
                        sil_score_list.append(0)
                        n_cluster_list.append(0)
                        n_clustered_list.append(0)
                        sil_med_list.append(0)
                eps_sil_list = [eps for (eps, minpts) in eps_minpts_tuple_list]
                idx = np.argsort(eps_sil_list)
                eps_sil_list = num.array(eps_sil_list)[idx]
                sil_score_list = num.array(sil_score_list)[idx]
                sil_med_list = num.array(sil_med_list)[idx]
                n_cluster_list = num.array(n_cluster_list)[idx]
                n_clustered_list = num.array(n_clustered_list)[idx]
                # shall this be interactive?
                cl.silhouette_scores(eps_sil_list, sil_score_list, sil_med_list,
                                     n_cluster_list, n_clustered_list, minpts,
                                     comp_name)
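
        # The eps scan covers eps = 0.01 .. 0.40 in steps of 0.01 for each
        # min_pts value; cl.silhouette_scores plots silhouette score/median
        # and the number of clusters/clustered events against eps, which
        # helps to pick the dbscan_eps values for the --cluster stage.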
    if args.cluster or args.run:
        logger.info('Starting clustering.')
        n_comps = get_ncomp(ns_cnf, cc_cnf)
        for i_comp in range(n_comps):
            if n_comps == 1 and ns_cnf.combine_components:
                comp_name = 'comb'
            else:
                comp_name = cc_cnf.components[i_comp]
            # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
            dist_file = './precalc_arrays/dists_%s.npy' % comp_name
            dists = num.load(dist_file)
            num.fill_diagonal(dists, 0.0)
            no_dist_evs = cl.get_list_event_no_dist(dists)
            # get all combinations of given eps and minpts values
            eps_minpts_tuple_list = list(itertools.product(
                set(cl_cnf.dbscan_eps), set(cl_cnf.dbscan_min_pts)))
            eps_minpts_tuple_list.sort(key=lambda x: x[0])
            if cl_cnf.method == 'dbscan':
                cluster_labels_list = cl.cluster_dbscan(
                    eps_minpts_tuples=eps_minpts_tuple_list,
                    dist_array=dists)
            else:
                logger.error("Clustering method not implemented. "
                             "Please use 'dbscan'.")
            eps_min_samples_cl_list = []
            # delete if not needed for harmonization:
            eps_list = [eps for (eps, minpts) in eps_minpts_tuple_list]
            minpts_list = [minpts for (eps, minpts) in eps_minpts_tuple_list]
            methodstrings = []
            for eps_minpts_tuple, cluster_labels in zip(eps_minpts_tuple_list,
                                                        cluster_labels_list):
                eps, min_pts = eps_minpts_tuple
                method_string = '%s_%s_%s_%.3f_%s' % (
                    ns_cnf.method_similarity_computation,
                    cl_cnf.method, comp_name, eps, min_pts)
                methodstrings.append(method_string)
                outfile = './results/cat_%s.yaml' % method_string
                cat = cl.catalog_output(cat, cluster_labels, outfile, no_dist_evs)
                if cl_cnf.plot_netsim_matrix:
                    if args.cluster:
                        network_similarity = num.load(
                            './precalc_arrays/network_similarity.npy')
                    cl.netsim_matrix(network_similarity[i_comp], cluster_labels,
                                     cat, method_string)
            result_catalogs = [os.path.join('results/', 'cat_' + ms + '.yaml')
                               for ms in methodstrings]
            result_catalogs = sorted(result_catalogs)
            if len(result_catalogs) > 1:
                # problem if no cluster found!
                cl.harmonize_cluster_labels(file_list=result_catalogs)
            for cat_path in result_catalogs:
                cat = model.load_events(cat_path)
                cluster_labels, cluster_labels_set = cl.get_cluster_labels(cat)
                n_cluster = len(cluster_labels_set) - 1
                n_clustered = sum(num.array(cluster_labels) != -1)
                method_string = os.path.split(cat_path)[-1][4:-5]
                eps = method_string.split('_')[-2]
                min_pts = method_string.split('_')[-1]
                if n_cluster > 0:
                    logger.info('Preparing Fruchterman plot for %s' % method_string)
                    cl.fruchtermann(dists=dists, cat=cat,
                                    eps=eps, pts_min=min_pts,
                                    cc_cnf=cc_cnf, cl_cnf=cl_cnf,
                                    method_string=method_string)
                else:
                    logger.info('No clusters found for %s. Skipping '
                                'Fruchterman plot.' % method_string)
                cl.silhouette_plot(dists, cat, method_string, eps, min_pts, cc_cnf)
            if len(result_catalogs) > 1:
                logger.info('Preparing sankey plot.')
                cl.sankey_multi(result_catalogs, basic_cnf,
                                plot_labels=cl_cnf.sankey_labels)

        if cl_cnf.repr_ev:
            eps, minpts = cl_cnf.repr_ev
            for i_comp in range(n_comps):
                # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
                if n_comps == 1 and ns_cnf.combine_components:
                    comp_name = 'comb'
                else:
                    comp_name = cc_cnf.components[i_comp]
                dists = num.load('./precalc_arrays/dists_%s.npy' % comp_name)
                try:
                    method_string = [
                        os.path.split(f)[-1][4:-5] for f in result_catalogs
                        if float((os.path.split(f)[-1][4:-5]).split('_')[-2]) == eps
                        and float((os.path.split(f)[-1][4:-5]).split('_')[-1]) == minpts
                        and (os.path.split(f)[-1][4:-5]).split('_')[-3] == comp_name][0]
                    result_catalog = model.load_events(
                        os.path.join('results/', 'cat_' + method_string + '.yaml'))
                except (IndexError, FileNotFoundError):
                    logger.error('Catalog for chosen eps and minpts for saving '
                                 'representative events not found.')
                    sys.exit()
                representative_events, max_sil_vals = cl.get_repr_ev(
                    result_catalog, dists, method_string)
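
        # cl.get_repr_ev selects one representative event per cluster,
        # apparently the event with the highest silhouette value within its
        # cluster (hence max_sil_vals); these are reused below for the
        # waveform plots.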
    if args.run or args.plot_results:
        logger.info('Preparing result plots.')
        eps_minpts_tuple_list = list(itertools.product(
            set(cl_cnf.dbscan_eps), set(cl_cnf.dbscan_min_pts)))
        methodstrings = []
        n_comps = get_ncomp(ns_cnf, cc_cnf)
        for i_comp in range(n_comps):
            # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
            if n_comps == 1 and ns_cnf.combine_components:
                comp_name = 'comb'
            else:
                comp_name = cc_cnf.components[i_comp]
            for eps, min_pts in eps_minpts_tuple_list:
                method_string = '%s_%s_%s_%.3f_%s' % (
                    ns_cnf.method_similarity_computation,
                    cl_cnf.method, comp_name, eps, min_pts)
                methodstrings.append(method_string)
        result_catalogs = [os.path.join('results/', 'cat_' + ms + '.yaml')
                           for ms in methodstrings]
        result_catalogs = sorted(result_catalogs)
        # i_ph = 1
        # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
        for f in result_catalogs:
            result_catalog = model.load_events(f)
            if len([ev for ev in result_catalog
                    if ev.extras['cluster_number'] != -1]) == 0:
                continue
            method_string = os.path.split(f)[-1][4:-5]
            comp_name = method_string.split('_')[-3]
            if cl_cnf.plot_map:
                cl.map_plot(result_catalog, method_string, cc_cnf, cl_cnf)
            cl.plot_cluster_timing(result_catalog, method_string)  # maybe later?! final plots...
        if cl_cnf.plot_hist:
            cl.nev_cluster_hist(result_catalogs)
        if len(result_catalogs) > 1:
            cl.calc_adjusted_rand(result_catalogs)

        if cl_cnf.wf_plot:
            eps = cl_cnf.wf_plot[0]
            minpts = cl_cnf.wf_plot[1]
            logger.info('Preparing waveform plot(s) for eps %s and minpts %s.'
                        % (eps, minpts))
            if cl_cnf.wf_plot_stats == []:
                subset_stations_and_indices = [
                    (i_st, st) for i_st, st in enumerate(station_list)]
            else:
                subset_stations_and_indices = [
                    (i_st, st) for i_st, st in enumerate(station_list)
                    if '%s.%s' % (st.network, st.station) in cl_cnf.wf_plot_stats]
            arrivals_array = num.load('./precalc_arrays/arrivals_array.npy')
            # try:
            #     tshifts_array = num.load('./precalc_arrays/cctshifts_array.npy')
            # except:
            #     tshifts_array = None
            for i_comp in range(n_comps):
                # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
                if n_comps == 1 and ns_cnf.combine_components:
                    comp_name = 'comb'
                else:
                    comp_name = cc_cnf.components[i_comp]
                dists = num.load('./precalc_arrays/dists_%s.npy' % comp_name)
                try:
                    method_string = [
                        os.path.split(f)[-1][4:-5] for f in result_catalogs
                        if float((os.path.split(f)[-1][4:-5]).split('_')[-2]) == eps
                        and float((os.path.split(f)[-1][4:-5]).split('_')[-1]) == minpts
                        and (os.path.split(f)[-1][4:-5]).split('_')[-3] == comp_name][0]
                    result_catalog = model.load_events(
                        os.path.join('results/', 'cat_' + method_string + '.yaml'))
                except (IndexError, FileNotFoundError):
                    logger.error('Catalog for chosen eps and minpts for wf plot '
                                 'not found.')
                    sys.exit()
                representative_events, max_sil_vals = cl.get_repr_ev(
                    result_catalog, dists, method_string)
                cl.wf_cluster_plot(result_catalog, subset_stations_and_indices,
                                   basic_cnf.waveform_dir, arrivals_array,
                                   cc_cnf, method_string,
                                   representative_events, max_sil_vals, cl_cnf)

        if cl_cnf.wf_cl_snuffle:
            eps = cl_cnf.wf_cl_snuffle[0]
            minpts = cl_cnf.wf_cl_snuffle[1]
            logger.info('Opening waveform snuffler for eps %s and minpts %s.'
                        % (eps, minpts))
            if cl_cnf.wf_plot_stats == []:
                subset_stations_and_indices = [
                    (i_st, st) for i_st, st in enumerate(station_list)]
            else:
                subset_stations_and_indices = [
                    (i_st, st) for i_st, st in enumerate(station_list)
                    if '%s.%s' % (st.network, st.station) in cl_cnf.wf_plot_stats]
            arrivals_array = num.load('./precalc_arrays/arrivals_array.npy')
            # try:
            #     tshifts_array = num.load('./precalc_arrays/cctshifts_array.npy')
            # except:
            #     tshifts_array = None
            for i_comp in range(n_comps):
                # phase_name = get_phasename(cc_cnf, ns_cnf, i_ph)
                if n_comps == 1 and ns_cnf.combine_components:
                    comp_name = 'comb'
                else:
                    comp_name = cc_cnf.components[i_comp]
                dists = num.load('./precalc_arrays/dists_%s.npy' % comp_name)
                try:
                    method_string = [
                        os.path.split(f)[-1][4:-5] for f in result_catalogs
                        if float((os.path.split(f)[-1][4:-5]).split('_')[-2]) == eps
                        and float((os.path.split(f)[-1][4:-5]).split('_')[-1]) == minpts
                        and (os.path.split(f)[-1][4:-5]).split('_')[-3] == comp_name][0]
                    result_catalog = model.load_events(
                        os.path.join('results/', 'cat_' + method_string + '.yaml'))
                except (IndexError, FileNotFoundError):
                    logger.error('Catalog for chosen eps and minpts for wf plot '
                                 'not found.')
                    sys.exit()
                representative_events, max_sil_vals = cl.get_repr_ev(
                    result_catalog, dists, method_string)
                cl.wf_cluster_snuffle(result_catalog, subset_stations_and_indices,
                                      basic_cnf.waveform_dir, arrivals_array,
                                      cc_cnf, method_string,
                                      representative_events, max_sil_vals, cl_cnf)
        logger.info('Finished result plots.')
    if args.merge_freq_results:
        '''
        input: list of config files with different frequency ranges
        output: save harmonized catalogs
        '''
        result_catalogs = []
        methodstrings = []
        basic_cnf, cc_cnf, ns_cnf, cl_cnf, mergecat_cnf = clusty_config.load(
            filename=args.config).settings
        faults_path = cl_cnf.map_faults_path
        work_dir = basic_cnf.work_dir
        cat = model.load_events(basic_cnf.catalog_file)
        eps_minpts_tuple_list = list(zip(mergecat_cnf.pref_eps,
                                         mergecat_cnf.pref_minpts))
        for i_cat, cat_config_file in enumerate(mergecat_cnf.cat_config_list):
            freq_dir_name = os.path.split(cat_config_file)[0]
            basic_cnf_temp, cc_cnf_temp, ns_cnf_temp, cl_cnf_temp = \
                clusty_config.load(filename=cat_config_file).settings
            os.makedirs('%s/input/%s' % (work_dir, freq_dir_name), exist_ok=True)
            copyfile(cat_config_file, '%s/input/%s' % (work_dir, cat_config_file))
            logger.info('adding %s/input/%s' % (work_dir, cat_config_file))
            n_comps = get_ncomp(ns_cnf_temp, cc_cnf_temp)
            for i_comp in range(n_comps):
                if n_comps == 1 and ns_cnf_temp.combine_components:
                    comp_name = 'comb'
                else:
                    comp_name = cc_cnf_temp.components[i_comp]
                dists = num.load('%s/precalc_arrays/dists_%s.npy'
                                 % (freq_dir_name, comp_name))
                num.fill_diagonal(dists, 0.0)
                no_dist_evs = cl.get_list_event_no_dist(dists)
                logger.info('Starting clustering for final results.')
                if cl_cnf_temp.method == 'dbscan':
                    eps, min_pts = eps_minpts_tuple_list[i_cat]
                    cluster_labels_list = cl.cluster_dbscan(
                        eps_minpts_tuples=[eps_minpts_tuple_list[i_cat]],
                        dist_array=dists)
                else:
                    logger.error("Clustering method not implemented. "
                                 "Please use 'dbscan'.")
                method_string = '%s_%s_%s_%.3f_%s' % (
                    ns_cnf_temp.method_similarity_computation,
                    cl_cnf_temp.method, comp_name, eps, min_pts)
                os.makedirs('%s/input/%s' % (work_dir, freq_dir_name),
                            exist_ok=True)
                outfile = '%s/input/%s/cat_%s.yaml' % (work_dir, freq_dir_name,
                                                       method_string)
                result_catalogs.append(outfile)
                cat = cl.catalog_output(cat, cluster_labels_list[0], outfile,
                                        no_dist_evs)
        logger.info('Starting harmonization.')
        if len(result_catalogs) > 1:
            method = '%s_%s_%s_' % (ns_cnf.method_similarity_computation,
                                    cl_cnf.method, comp_name)
            cl.harmonize_cluster_labels(file_list=result_catalogs,
                                        single_freq=False)
            cl.sankey_multi(result_catalogs, basic_cnf,
                            fn='%s/sankey.html' % work_dir,
                            single_freq=False, plot_labels=cl_cnf.sankey_labels)
            # what about a new silhouette plot?
            # cl.silhouette_plot(dists, cat, method_string, eps, min_pts, cc_cnf)
            # probably omitted:
            # cl.calc_adjusted_rand(result_catalogs)
            logger.info('Start merging catalogs of different frequency bands.')
            cat_merged = merge_freqs.merge_harmonized_catalogs(result_catalogs)
            os.makedirs('%s/results' % work_dir, exist_ok=True)
            model.dump_events(cat_merged, '%s/results/cat_merged_%s.yaml'
                              % (work_dir, method), format='yaml')
            fn = '%s/results/cluster_map_%s.pdf' % (work_dir, method)
            if cl_cnf.plot_map:
                cl.map_plot(cat_merged, method_string,
                            cc_cnf=False, cl_cnf=cl_cnf,
                            fn=fn)
    if args.export_stacked_traces:
        logger.info('Preparing waveform stacking, reading prepared arrays.')
        # rebuild the method strings for all eps/minpts combinations given in
        # the config (needed when this stage is run on its own)
        eps_minpts_tuple_list = list(itertools.product(
            set(cl_cnf.dbscan_eps), set(cl_cnf.dbscan_min_pts)))
        methodstrings = []
        n_comps = get_ncomp(ns_cnf, cc_cnf)
        for i_comp in range(n_comps):
            if n_comps == 1 and ns_cnf.combine_components:
                comp_name = 'comb'
            else:
                comp_name = cc_cnf.components[i_comp]
            for eps, min_pts in eps_minpts_tuple_list:
                method_string = '%s_%s_%s_%.3f_%s' % (
                    ns_cnf.method_similarity_computation,
                    cl_cnf.method, comp_name, eps, min_pts)
                methodstrings.append(method_string)
        result_catalogs = [os.path.join('results/', 'cat_' + ms + '.yaml')
                           for ms in methodstrings]
        result_catalogs = sorted(result_catalogs)
        arrivals_array = num.load('./precalc_arrays/arrivals_array.npy')
        tshifts_array = num.load('./precalc_arrays/cctshifts_array.npy')
        coef_array = num.load('./precalc_arrays/cccoef_array.npy')
        logger.info('Starting waveform stacking.')
        for f in result_catalogs:
            result_catalog = model.load_events(f)
            method_string = os.path.split(f)[-1][4:-5]
            cl.export_stacked_traces(result_catalog, basic_cnf.waveform_dir,
                                     arrivals_array, coef_array,
                                     tshifts_array, station_list,
                                     cl_cnf.prepare_for_grond,
                                     cl_cnf.debug_stacking,
                                     cc_cnf.components,
                                     method_string)

    logger.info('Clusty run completed.')


if __name__ == '__main__':
    main()
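
# Typical invocations (sketch; the exact entry-point name depends on how the
# package is installed, e.g. a `clusty` console script):
#
#     clusty --init                                  # write config_clusty.yaml
#     clusty --config config_clusty.yaml --run       # full run
#     clusty --config config_clusty.yaml --cc        # cross-correlation only
#     clusty --config config_clusty.yaml --netsim --cluster
#     clusty --config config_clusty.yaml --plot_results --log_level DEBUG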