from comet_ml import Experiment
import comet_ml
import pickle
import os
from functools import partial
import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from xgboost import XGBClassifier
from scipy.optimize import curve_fit
import sys
sys.path.append('..')
from helper.plotting import plot_roc, plot_score_vs_pt, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd
plt.rcParams['font.size']=16
pd.options.display.max_columns = 200
nrows_b = 250000
nrows_c = 250000
nrows_udsg = 250000
skiprows = 200000
df_b = pd.read_csv('../datasets/iter2/bjets_10-150GeV_base.csv', nrows=nrows_b, skiprows=range(1,skiprows))
df_b['flavour'] = 'b'
df_b = convert_float64_to_float32(df_b)
df_c = pd.read_csv('../datasets/iter2/cjets_10-150GeV_base.csv', nrows=nrows_c, skiprows=range(1,skiprows))
df_c['flavour'] = 'c'
df_c = convert_float64_to_float32(df_c)
df_udsg = pd.read_csv('../datasets/iter2/udsgjets_10-150GeV_base.csv', nrows=nrows_udsg, skiprows=range(1,skiprows))
df_udsg['flavour'] = 'udsg'
df_udsg = convert_float64_to_float32(df_udsg)
TODO:
def get_model_from_exp(exp_id, model_type=XGBClassifier, featnames_type=(pd.Index, pd.Series, np.array, list), scaler_type=StandardScaler, api=comet_ml.API()):
exp = api.get(exp_id)
assets = exp.get_model_asset_list(exp.get_model_names()[0])
asset_id_model = assets[ ['model' in a['fileName'] for a in assets].index(True) ]['assetId']
asset_id_featnames = assets[ ['feat' in a['fileName'] for a in assets].index(True) ]['assetId']
asset_id_scaler = assets[ ['scaler' in a['fileName'] for a in assets].index(True) ]['assetId']
model_bin = exp.get_asset(asset_id_model)
model = pickle.loads(model_bin)
assert isinstance(model, model_type)
featnames_bin = exp.get_asset(asset_id_featnames)
featnames = pickle.loads(featnames_bin)
assert isinstance(featnames, featnames_type)
scaler_bin = exp.get_asset(asset_id_scaler)
scaler = pickle.loads(scaler_bin)
assert isinstance(scaler, scaler_type)
return model, np.array(featnames), scaler
exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/bcf99db8f5a94b2184e6e13161c50bbe'
# exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/61c014a2ff7c49e8bae9ec466ffaa998'
exp_id_b_vs_c = 'phd/b-vs-c/3ce14e4e99d54283bc66eb24c98b6468'
clf_bc_vs_udsg , feats_bc_vs_udsg, scaler_bc_vs_udsg = get_model_from_exp(exp_id_bc_vs_udsg)
clf_b_vs_c , feats_b_vs_c , scaler_b_vs_c = get_model_from_exp(exp_id_b_vs_c)
feats_all = np.unique(np.hstack([feats_bc_vs_udsg, feats_b_vs_c]))
def short_exp_id(exp_id):
return exp_id.split('/')[-1][:6]
X = scaler_bc_vs_udsg.transform(df_b[feats_bc_vs_udsg])
y_b_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_b[feats_b_vs_c])
y_b_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]
X = scaler_bc_vs_udsg.transform(df_c[feats_bc_vs_udsg])
y_c_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_c[feats_b_vs_c])
y_c_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]
X = scaler_bc_vs_udsg.transform(df_udsg[feats_bc_vs_udsg])
y_udsg_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_udsg[feats_b_vs_c])
y_udsg_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]
n = 100000
alpha = 1
fig,ax = plt.subplots(figsize=(10,8))
ax.plot(y_udsg_proba_bc_vs_udsg[:n], y_udsg_proba_b_vs_c[:n], ',', c='b', alpha=alpha)
ax.plot(y_c_proba_bc_vs_udsg[:n], y_c_proba_b_vs_c[:n], ',', c='orange', alpha=alpha)
ax.plot(y_b_proba_bc_vs_udsg[:n], y_b_proba_b_vs_c[:n], ',', c='r', alpha=alpha)
ax.set_xlabel('score bc vs udsg')
ax.set_ylabel('score b vs c')
ax.set_xlim([0,1])
ax.set_ylim([0,1])
# plt.savefig(f'scores_2BDTs_all-flavours_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
[y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
[y_udsg_proba_b_vs_c, y_c_proba_b_vs_c, y_b_proba_b_vs_c],
['udsg', 'c', 'b'],
):
plt.figure(figsize=(7,5))
plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), vmin=10, vmax=3000);
plt.colorbar()
plt.xlabel('score bc vs udsg')
plt.ylabel('score b vs c')
plt.title(f'{flavour}-jets')
plt.tight_layout()
plt.xlim([0,1])
plt.ylim([0,1])
# plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')
Fig. 3 in https://arxiv.org/pdf/1504.07670.pdf
b_eff, c_eff, udsg_eff = [], [], []
thresholds = np.linspace(0.84, 0.99, 40)
for t in thresholds:
print(t)
udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
c_eff.append( sum(y_c_proba_bc_vs_udsg > t) / len(y_c_proba_bc_vs_udsg))
b_eff.append( sum(y_b_proba_bc_vs_udsg > t) / len(y_b_proba_bc_vs_udsg))
plt.plot(thresholds, udsg_eff, '.-')
plt.semilogy()
plt.plot(udsg_eff, c_eff, '.-', c='lime', label='c-jets')
plt.plot(udsg_eff, b_eff, 'r.-', label='b-jets')
plt.xlabel('$udsg$ mistagging rate')
plt.ylabel('($b,c$)-jet tag efficiency')
plt.xlim(0,0.02)
plt.ylim(0,1)
plt.grid()
plt.legend()
plt.tight_layout()
# plt.savefig(f'perf_comparison_LHCb_{short_exp_id(exp_id_bc_vs_udsg)}_unzoom.png')
idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index
b_eff, udsg_eff = [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
print(t)
udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
b_eff.append( sum(y_b_proba_bc_vs_udsg > t) / len(y_b_proba_bc_vs_udsg))
udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
b_eff_3040.append( sum(y_b_proba_bc_vs_udsg[idx_b] > t) / len(y_b_proba_bc_vs_udsg[idx_b]))
plt.figure(figsize=(6,7))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(1e-4,1)
plt.xlim(0,1)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()
# plt.savefig(f'perf_comparison_HadiHassan_{short_exp_id(exp_id_bc_vs_udsg)}.png')
Fig. 2 in https://arxiv.org/pdf/1709.08497.pdf
idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index
b_eff, c_eff, udsg_eff = [], [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
print(t)
udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
b_eff.append( sum(y_b_proba_bc_vs_udsg > t) / len(y_b_proba_bc_vs_udsg))
udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
b_eff_3040.append( sum(y_b_proba_bc_vs_udsg[idx_b] > t) / len(y_b_proba_bc_vs_udsg[idx_b]))
plt.figure(figsize=(9,6))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(7e-8,1)
plt.xlim(0.14, 0.86)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
# plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()
plt.gca().set_yticks([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]);
# plt.savefig(f'perf_comparison_Rudiger_{short_exp_id(exp_id_bc_vs_udsg)}.png')
# read all data and do not skip anything (nothing was used for training)
df_data = pd.read_csv('../datasets/iter2/alljets_10-150GeV_base.csv')
df_data['flavour'] = 'data'
df_data = convert_float64_to_float32(df_data)
X = scaler_bc_vs_udsg.transform(df_data[feats_bc_vs_udsg])
y_data_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_data[feats_b_vs_c])
y_data_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]
kwargs = dict(bins=np.linspace(0,150,151), density=0, histtype='step' , lw=2)
plt.figure(figsize=(8,5))
plt.hist(df_udsg['Jet_Pt'], color='blue' , label='udsg' , **kwargs);
plt.hist(df_c['Jet_Pt'], color='orange' , label='c' , **kwargs);
plt.hist(df_b['Jet_Pt'], color='r' , label='b' , **kwargs);
plt.hist(df_data['Jet_Pt'], color='k' , label='data' , **kwargs);
plt.legend()
plt.grid()
plt.semilogy()
plt.xlabel('jet $p_T^{reco}$ [GeV/c]');
plt.ylabel('counts');
# plt.savefig('pT_spectrum_by_flavour.png')
kwargs = dict(bins=np.linspace(-0.5,39.5,41), density=1, histtype='step' , lw=2)
var = 'Jet_NumTracks'
pt_queries = ['Jet_Pt > 0 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50 and Jet_Pt < 100']
for query in pt_queries:
plt.figure(figsize=(8,5))
plt.hist(df_udsg[var][df_udsg.query(query).index], color='blue' , label='udsg' , **kwargs);
plt.hist(df_c[var][df_c.query(query).index], color='orange' , label='c' , **kwargs);
plt.hist(df_b[var][df_b.query(query).index], color='r' , label='b' , **kwargs);
plt.hist(df_data[var][df_data.query(query).index], color='k' , label='data' , **kwargs);
plt.legend()
plt.grid()
plt.semilogy()
plt.xlabel('N tracks in jet');
plt.ylabel('probability');
# plt.savefig('pT_spectrum_by_flavour.png')
kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)
plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_bc_vs_udsg, color='blue' , label='udsg' , **kwargs);
plt.hist(y_c_proba_bc_vs_udsg, color='orange' , label='c' , **kwargs);
plt.hist(y_b_proba_bc_vs_udsg, color='r' , label='b' , **kwargs);
plt.hist(y_data_proba_bc_vs_udsg, color='k' , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $bc$ vs $udsg$');
plt.ylabel('counts');
# plt.savefig(f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour.png')
kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)
plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_b_vs_c, color='blue' , label='udsg' , **kwargs);
plt.hist(y_c_proba_b_vs_c, color='orange' , label='c' , **kwargs);
plt.hist(y_b_proba_b_vs_c, color='r' , label='b' , **kwargs);
plt.hist(y_data_proba_b_vs_c, color='k' , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $b$ vs $c$');
plt.ylabel('counts');
# plt.savefig(f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour.png')
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
[y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
[y_data_proba_b_vs_c , y_udsg_proba_b_vs_c, y_c_proba_b_vs_c, y_b_proba_b_vs_c ],
['data' , 'udsg', 'c', 'b' ],
):
plt.figure(figsize=(7,5))
plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), cmin=50);
plt.colorbar()
plt.xlabel('score bc vs udsg')
plt.ylabel('score b vs c')
plt.title(f'{flavour}-jets')
plt.tight_layout()
plt.xlim([0,1])
plt.ylim([0,1])
# plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')
pt_queries = ['Jet_Pt > 10 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 40', 'Jet_Pt > 40 and Jet_Pt < 100']
kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0
for query in pt_queries:
plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index], color='blue' , label='udsg' , **kwargs);
plt.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index], color='orange' , label='c' , **kwargs);
yrange = plt.ylim()
plt.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index], color='r' , label='b' , **kwargs);
plt.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index], color='k' , label='data' , **kwargs);
if logy:
plt.semilogy()
plt.legend(loc='lower center')
plt.ylim(1e-3, 1e2)
suffix = ''
else:
plt.legend(loc=(0.6, 0.6))
# plt.ylim(yrange[0], yrange[1]*1.2)
plt.ylim(0,10)
suffix = '_liny'
plt.grid()
plt.xlabel('score $bc$ vs $udsg$');
plt.ylabel('probability');
plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
plt.tight_layout()
pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
fname = f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour_pt{pt_range}{suffix}.png'
print(fname)
# plt.savefig(fname)
kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0
for query in pt_queries:
plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index], color='blue' , label='udsg' , **kwargs);
plt.hist(y_c_proba_b_vs_c[df_c.query(query).index], color='orange' , label='c' , **kwargs);
yrange = plt.ylim()
plt.hist(y_b_proba_b_vs_c[df_b.query(query).index], color='r' , label='b' , **kwargs);
plt.hist(y_data_proba_b_vs_c[df_data.query(query).index], color='k' , label='data' , **kwargs);
if logy:
plt.semilogy()
plt.legend(loc='lower center')
plt.ylim(1e-3,2e1)
suffix = ''
plt.text(0.4, 0.9, query, transform=plt.gca().transAxes)
else:
plt.legend(loc=(0.6, 0.6))
# plt.ylim(yrange[0], yrange[1]*1.2)
plt.ylim(0,10)
suffix = '_liny'
plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
plt.grid()
plt.xlabel('score $b$ vs $c$');
plt.ylabel('counts');
plt.tight_layout()
pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
fname = f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour_pt{pt_range}{suffix}.png'
print(fname)
# plt.savefig(fname)
# for q in ['Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50']:
n_bins = 15
for query in pt_queries:
print()
fig, axes = plt.subplots(ncols=4, figsize=(20,5))
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour, df, ax in zip(
[y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
[y_data_proba_b_vs_c , y_udsg_proba_b_vs_c, y_c_proba_b_vs_c, y_b_proba_b_vs_c ],
['data' , 'udsg', 'c', 'b' ],
[df_data , df_udsg , df_c , df_b ],
axes,
):
idx = df.query(query).index
cmin = int(1e-4*len(idx))
print(cmin)
ax.hist2d(y_proba_bc_vs_udsg[idx], y_proba_b_vs_c[idx],
bins=[np.linspace(0,1,n_bins+1), np.linspace(0,1,n_bins+1)],
norm=mpl.colors.LogNorm(),
cmin=cmin,
# density=1,
# vmin=1e-6, vmax=1e4,
);
# plt.colorbar()
ax.set_xlabel('score bc vs udsg')
ax.set_ylabel('score b vs c')
pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
ax.set_title(f'{flavour}-jets pT {pt_range} GeV/c')
plt.tight_layout()
ax.set_xlim([0,1])
ax.set_ylim([0,1])
fname = f'scores_2BDTs_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}_pt{pt_range}.png'
# plt.savefig(fname)
bins = np.linspace(0,1,101)
bin_centers = (bins[:-1]+bins[1:])/2
kwargs_np_hist = dict(bins=bins, density=0)
counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg, **kwargs_np_hist)
counts_c, x = np.histogram(y_c_proba_bc_vs_udsg , **kwargs_np_hist)
counts_b, x = np.histogram(y_b_proba_bc_vs_udsg , **kwargs_np_hist)
# y_test = y_udsg_proba_bc_vs_udsg*80 + y_c_proba_bc_vs_udsg*20 #+ y_b_proba_bc_vs_udsg*5
# counts_test, x = np.histogram(y_test , **kwargs_np_hist)
counts_test = counts_udsg*87 + counts_c*10 + counts_b*3
noise = np.random.randint(1,100000,len(bin_centers))-50000 #+ np.linspace(0,500000, len(bin_centers)) - 250000
counts_test += noise.astype(int)
def func(x, n_udsg, n_c, n_b):
counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
ret = []
for x_i in x:
v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
ret.append(v)
return ret
popt, pcov = curve_fit(func, bin_centers, counts_test)
plt.figure(figsize=(8,6))
plt.plot(bin_centers, counts_udsg, label='udsg')
plt.plot(bin_centers, counts_c, label='c')
plt.plot(bin_centers, counts_b, label='b')
plt.plot(bin_centers, counts_test/100,'-', lw=2, c='k', label='artif. mix. udsg+c+b + noise')
plt.plot(bin_centers, (counts_test-noise.astype(int))/100,'--', lw=2, c='k', label='artif. mix udsg+c+b')
plt.semilogy()
plt.legend()
print(f'fractions from fit: {popt}')
popt
def make_fit(query, clf_type, nbins, logy, verbose=True):
bins = np.linspace(0,1,nbins+1)
bin_centers = (bins[:-1]+bins[1:])/2
kwargs_np_hist = dict(bins=bins, density=0)
if clf_type == 'bc_vs_udsg':
counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
counts_c, x = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index] , **kwargs_np_hist)
counts_b, x = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index] , **kwargs_np_hist)
counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)
elif clf_type == 'b_vs_c':
counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
counts_c, x = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index] , **kwargs_np_hist)
counts_b, x = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index] , **kwargs_np_hist)
counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)
pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
def func(x, n_udsg, n_c, n_b):
counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
ret = []
for x_i in x:
v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
ret.append(v)
return ret
sum_data = np.sum(counts_data)
sum_udsg = np.sum(counts_udsg)
fit_max = sum_data/sum_udsg
sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas,
bounds=([0.7*fit_max,0,0], [fit_max,fit_max,fit_max]))
residuals = func(bin_centers, *popt) - counts_data
chi2 = np.sum((residuals / sigmas)**2)
ndf = len(bin_centers) - 3
if verbose:
print(f'results for pT = {pt_range}, {nbins} bins: \n\t\t\t fit_max={fit_max:.3f} raw frac. = {[round(p,4) for p in popt]}\n\t\t \
frac. = {[round(p/sum(popt),3) for p in popt]} uncert. = {[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}\n\t\t \
chi2 / Ndf = {chi2:.1f}/{ndf} = {chi2/ndf:.1f}')
fig,axes = plt.subplots(figsize=(16,4), ncols=2)
kwargs_plt_hist = dict(bins=bins, density=1, histtype='step', lw=2)
ax = axes[0]
if clf_type == 'bc_vs_udsg':
ax.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
ax.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index] , **kwargs_plt_hist, color='orange', label='c')
ax.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index] , **kwargs_plt_hist, color='r', label='r')
ax.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
axes[0].set_xlabel('score $bc$-vs-$udsg$')
axes[1].set_xlabel('score $bc$-vs-$udsg$')
elif clf_type == 'b_vs_c':
ax.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
ax.hist(y_c_proba_b_vs_c[df_c.query(query).index] , **kwargs_plt_hist, color='orange', label='c')
ax.hist(y_b_proba_b_vs_c[df_b.query(query).index] , **kwargs_plt_hist, color='r', label='r')
ax.hist(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
axes[0].set_xlabel('score $b$-vs-$c$')
axes[1].set_xlabel('score $b$-vs-$c$')
if logy:
ax.legend(loc='lower center')
ax.semilogy()
else:
ax.legend(loc='upper left')
ax.set_ylim(0, 10)
ax = axes[1]
ax.bar(bin_centers, counts_udsg*popt[0], 1/len(bin_centers) , label='udsg', color='b', alpha=0.5)
ax.bar(bin_centers, counts_c*popt[1] , 1/len(bin_centers), bottom=counts_udsg*popt[0] , label='c', color='orange', alpha=0.7)
ax.bar(bin_centers, counts_b*popt[2] , 1/len(bin_centers), bottom=counts_udsg*popt[0]+counts_c*popt[1], label='b', color='r', alpha=0.8)
ax.errorbar(bin_centers, counts_data, yerr=np.sqrt(counts_data), fmt='o', color='k', alpha=0.8, label='data')
if logy:
ax.legend(loc='lower center')
ax.semilogy()
txt_pos = 0.98, 0.9
else:
ax.legend(loc='upper right')
txt_pos = 0.98, 0.4
ax.text(*txt_pos, f'chi2/Ndf = {chi2/ndf:.1f}',
transform=ax.transAxes,
fontsize=12, horizontalalignment='right',
bbox=dict(facecolor='white', alpha=0.2)
)
plt.suptitle(query)
return axes
below you can find plots of fitting data score distribution to templates coming from MC.
Templates are shown on the left and fitted distribution on the right.
There are plots for:
bc-vs-udsg (linscale)
for query in pt_queries[1:]:
make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=0, verbose=0)
bc-vs-udsg (logscale)
for query in pt_queries[1:]:
make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=1, verbose=0)
b-vs-c (linscale)
for query in pt_queries[1:]:
make_fit(query, clf_type='b_vs_c', nbins=50, logy=0, verbose=0)
b-vs-c (logscale)
for query in pt_queries[1:]:
make_fit(query, clf_type='b_vs_c', nbins=50, logy=1, verbose=0)
def func(x, n_udsg, n_c, n_b):
counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
ret = []
for x_i in x:
v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
ret.append(v)
return ret
# nbins = 20
logy = 0
dpt = 0.1
fit_res = dict()
for query_unvaried in pt_queries[1:]:
print()
lim1,lim2 = [int(x) for x in np.array(query_unvaried.split(' '))[[2,6]]]
fit_res[(lim1,lim2)] = dict(udsg=[], c=[], b=[])
for lim1_varied in [lim1, lim1*(1+dpt), lim1*(1-dpt)]:
for lim2_varied in [lim2, lim2*(1+dpt), lim2*(1-dpt)]:
for nbins in [30, 50, 100]:
query = f'Jet_Pt > {lim1_varied:.1f} and Jet_Pt < {lim2_varied:.1f}'
bins = np.linspace(0,1,nbins+1)
bin_centers = (bins[:-1]+bins[1:])/2
kwargs_np_hist = dict(bins=bins, density=0)
# counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
# counts_c, x = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index] , **kwargs_np_hist)
# counts_b, x = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index] , **kwargs_np_hist)
# counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)
counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
counts_c, x = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index] , **kwargs_np_hist)
counts_b, x = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index] , **kwargs_np_hist)
counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)
pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
sum_data = np.sum(counts_data)
sum_udsg = np.sum(counts_udsg)
fit_max = sum_data/sum_udsg
sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas,
bounds=([fit_max*0.7, 0, 0], fit_max))
residuals = func(bin_centers, *popt) - counts_data
chi2 = np.sum((residuals / sigmas)**2)
ndf = len(bin_centers) - 3
print(f'q={query}, nbins={nbins}: \tchi2/Ndf = {chi2/ndf:.1f} \t frac = {[round(p/sum(popt),3) for p in popt]}')
fit_res[(lim1,lim2)]['udsg'].append(popt[0]/sum(popt))
fit_res[(lim1,lim2)]['c'].append(popt[1]/sum(popt))
fit_res[(lim1,lim2)]['b'].append(popt[2]/sum(popt))
# print(f'results for pT = {pt_range}, {nbins} bins: {[round(p,4) for p in popt]} {[round(p/sum(popt),3) for p in popt]}\t{[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}')
x1,x2 = [],[]
x = []
xerr = []
y = []
yerr = []
for k,v in fit_res.items():
print(k)
x1.append(k[0])
x2.append(k[1])
x.append((k[0]+k[1])/2)
xerr.append((k[1]-k[0])/2)
r = v['c']
y.append(np.median(r))
yerr.append(np.std(r))
# yerr.append(np.percentile(r,75) - np.percentile(r,25))
plt.figure(figsize=(6,5))
plt.errorbar(x,y,yerr=yerr,xerr=xerr, fmt='o');
plt.ylim(bottom=0)
plt.grid()
plt.xlabel('$p_T^{\,jet}$ [GeV/c]');
plt.ylabel('$c$-fraction')
plt.title('based on template fit to $b$-vs-$c$ score distr.', fontsize=12)
plt.tight_layout()
# plt.savefig('c-frac_template-fit_b-vs-c.png')
fit_res[(30,40)]['b']
plt.hist(fit_res[(30,40)]['c'], bins=10);