LINK TO PLOTS OF TEMPLATE FITTING ¶

from comet_ml import Experiment
import comet_ml
import pickle

import os
from functools import partial

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from xgboost import XGBClassifier
from scipy.optimize import curve_fit

import sys 
sys.path.append('..')
from helper.plotting import plot_roc, plot_score_vs_pt, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd

plt.rcParams['font.size']=16
pd.options.display.max_columns = 200

Load data from csv¶

nrows_b    = 250000
nrows_c    = 250000
nrows_udsg = 250000

skiprows   = 200000

df_b = pd.read_csv('../datasets/iter2/bjets_10-150GeV_base.csv', nrows=nrows_b, skiprows=range(1,skiprows))
df_b['flavour'] = 'b'
df_b = convert_float64_to_float32(df_b)

df_c = pd.read_csv('../datasets/iter2/cjets_10-150GeV_base.csv', nrows=nrows_c, skiprows=range(1,skiprows))
df_c['flavour'] = 'c'
df_c = convert_float64_to_float32(df_c)

df_udsg = pd.read_csv('../datasets/iter2/udsgjets_10-150GeV_base.csv', nrows=nrows_udsg, skiprows=range(1,skiprows))
df_udsg['flavour'] = 'udsg'
df_udsg = convert_float64_to_float32(df_udsg)

Load models from comet.ml¶

TODO:

?? select_best_model(metric='test_roc_auc', constrains='test_metric / train_metric < 1.05')

def get_model_from_exp(exp_id, model_type=XGBClassifier, featnames_type=(pd.Index, pd.Series, np.array, list), scaler_type=StandardScaler, api=comet_ml.API()):
    exp = api.get(exp_id)
    assets = exp.get_model_asset_list(exp.get_model_names()[0])
    asset_id_model     = assets[  ['model' in a['fileName']     for a in assets].index(True)  ]['assetId']
    asset_id_featnames = assets[  ['feat' in a['fileName'] for a in assets].index(True)  ]['assetId']
    asset_id_scaler    = assets[  ['scaler' in a['fileName']    for a in assets].index(True)  ]['assetId']

    model_bin = exp.get_asset(asset_id_model)
    model = pickle.loads(model_bin)
    assert isinstance(model, model_type)
    
    featnames_bin = exp.get_asset(asset_id_featnames)
    featnames = pickle.loads(featnames_bin)
    assert isinstance(featnames, featnames_type)
    
    scaler_bin = exp.get_asset(asset_id_scaler)
    scaler = pickle.loads(scaler_bin)
    assert isinstance(scaler, scaler_type)
    
    return model, np.array(featnames), scaler

exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/bcf99db8f5a94b2184e6e13161c50bbe'
# exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/61c014a2ff7c49e8bae9ec466ffaa998'
exp_id_b_vs_c     = 'phd/b-vs-c/3ce14e4e99d54283bc66eb24c98b6468' 
clf_bc_vs_udsg , feats_bc_vs_udsg, scaler_bc_vs_udsg = get_model_from_exp(exp_id_bc_vs_udsg)
clf_b_vs_c     , feats_b_vs_c    , scaler_b_vs_c     = get_model_from_exp(exp_id_b_vs_c)
feats_all = np.unique(np.hstack([feats_bc_vs_udsg, feats_b_vs_c]))

def short_exp_id(exp_id):
    return exp_id.split('/')[-1][:6]

Apply models¶

X = scaler_bc_vs_udsg.transform(df_b[feats_bc_vs_udsg])
y_b_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_b[feats_b_vs_c])
y_b_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

X = scaler_bc_vs_udsg.transform(df_c[feats_bc_vs_udsg])
y_c_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_c[feats_b_vs_c])
y_c_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

X = scaler_bc_vs_udsg.transform(df_udsg[feats_bc_vs_udsg])
y_udsg_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_udsg[feats_b_vs_c])
y_udsg_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

Plot scores distributions¶

2D histos / scatterplots for each flavour¶

n = 100000
alpha = 1

fig,ax = plt.subplots(figsize=(10,8))
ax.plot(y_udsg_proba_bc_vs_udsg[:n], y_udsg_proba_b_vs_c[:n], ',', c='b', alpha=alpha)
ax.plot(y_c_proba_bc_vs_udsg[:n], y_c_proba_b_vs_c[:n], ',', c='orange', alpha=alpha)
ax.plot(y_b_proba_bc_vs_udsg[:n], y_b_proba_b_vs_c[:n], ',', c='r', alpha=alpha)
ax.set_xlabel('score bc vs udsg')
ax.set_ylabel('score b vs c')
ax.set_xlim([0,1])
ax.set_ylim([0,1])
# plt.savefig(f'scores_2BDTs_all-flavours_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')

(0, 1)

for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
                                        [y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                        [y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c],
                                        ['udsg',                  'c',                  'b'],
                                    ):
    plt.figure(figsize=(7,5))
    plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), vmin=10, vmax=3000);
    plt.colorbar()
    plt.xlabel('score bc vs udsg')
    plt.ylabel('score b vs c')
    plt.title(f'{flavour}-jets')
    plt.tight_layout()
    plt.xlim([0,1])
    plt.ylim([0,1])
#     plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')

Comparison with others¶

LHCb¶

Fig. 3 in https://arxiv.org/pdf/1504.07670.pdf

b_eff, c_eff, udsg_eff = [], [], []
thresholds = np.linspace(0.84, 0.99, 40)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    c_eff.append(   sum(y_c_proba_bc_vs_udsg > t)    / len(y_c_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))

0.84
0.8438461538461538
0.8476923076923076
0.8515384615384615
0.8553846153846154
0.8592307692307692
0.8630769230769231
0.8669230769230769
0.8707692307692307
0.8746153846153846
0.8784615384615384
0.8823076923076922
0.8861538461538462
0.89
0.8938461538461538
0.8976923076923077
0.9015384615384615
0.9053846153846153
0.9092307692307692
0.9130769230769231
0.916923076923077
0.9207692307692308
0.9246153846153846
0.9284615384615384
0.9323076923076923
0.9361538461538461
0.94
0.9438461538461538
0.9476923076923077
0.9515384615384616
0.9553846153846154
0.9592307692307692
0.963076923076923
0.9669230769230769
0.9707692307692308
0.9746153846153847
0.9784615384615385
0.9823076923076923
0.9861538461538462
0.99

plt.plot(thresholds, udsg_eff, '.-')
plt.semilogy()

[]

plt.plot(udsg_eff, c_eff, '.-', c='lime', label='c-jets')
plt.plot(udsg_eff, b_eff, 'r.-', label='b-jets')
plt.xlabel('$udsg$ mistagging rate')
plt.ylabel('($b,c$)-jet tag efficiency')
plt.xlim(0,0.02)
plt.ylim(0,1)
plt.grid()
plt.legend()
plt.tight_layout()
# plt.savefig(f'perf_comparison_LHCb_{short_exp_id(exp_id_bc_vs_udsg)}_unzoom.png')

Hadi Hassan¶

Fig. 10 in https://alice-notes.web.cern.ch/system/files/notes/analysis/982/2019-10-06-ALICE_analysis_note.pdf

idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index

b_eff, udsg_eff = [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))
    
    udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
    b_eff_3040.append(   sum(y_b_proba_bc_vs_udsg[idx_b] > t)       / len(y_b_proba_bc_vs_udsg[idx_b]))

0.001
0.02136734693877551
0.04173469387755102
0.06210204081632653
0.08246938775510204
0.10283673469387755
0.12320408163265306
0.14357142857142857
0.16393877551020408
0.1843061224489796
0.2046734693877551
0.2250408163265306
0.24540816326530612
0.26577551020408163
0.28614285714285714
0.30651020408163265
0.32687755102040816
0.3472448979591837
0.3676122448979592
0.3879795918367347
0.4083469387755102
0.4287142857142857
0.4490816326530612
0.46944897959183673
0.48981632653061224
0.5101836734693878
0.5305510204081633
0.5509183673469388
0.5712857142857143
0.5916530612244898
0.6120204081632653
0.6323877551020408
0.6527551020408163
0.6731224489795918
0.6934897959183673
0.7138571428571429
0.7342244897959184
0.7545918367346939
0.7749591836734694
0.7953265306122449
0.8156938775510204
0.8360612244897959
0.8564285714285714
0.8767959183673469
0.8971632653061224
0.917530612244898
0.9378979591836735
0.958265306122449
0.9786326530612245
0.999

plt.figure(figsize=(6,7))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(1e-4,1)
plt.xlim(0,1)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()

# plt.savefig(f'perf_comparison_HadiHassan_{short_exp_id(exp_id_bc_vs_udsg)}.png')

Rudiger¶

Fig. 2 in https://arxiv.org/pdf/1709.08497.pdf

idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index

b_eff, c_eff, udsg_eff = [], [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))
    
    udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
    b_eff_3040.append(   sum(y_b_proba_bc_vs_udsg[idx_b] > t)       / len(y_b_proba_bc_vs_udsg[idx_b]))

0.001
0.02136734693877551
0.04173469387755102
0.06210204081632653
0.08246938775510204
0.10283673469387755
0.12320408163265306
0.14357142857142857
0.16393877551020408
0.1843061224489796
0.2046734693877551
0.2250408163265306
0.24540816326530612
0.26577551020408163
0.28614285714285714
0.30651020408163265
0.32687755102040816
0.3472448979591837
0.3676122448979592
0.3879795918367347
0.4083469387755102
0.4287142857142857
0.4490816326530612
0.46944897959183673
0.48981632653061224
0.5101836734693878
0.5305510204081633
0.5509183673469388
0.5712857142857143
0.5916530612244898
0.6120204081632653
0.6323877551020408
0.6527551020408163
0.6731224489795918
0.6934897959183673
0.7138571428571429
0.7342244897959184
0.7545918367346939
0.7749591836734694
0.7953265306122449
0.8156938775510204
0.8360612244897959
0.8564285714285714
0.8767959183673469
0.8971632653061224
0.917530612244898
0.9378979591836735
0.958265306122449
0.9786326530612245
0.999

plt.figure(figsize=(9,6))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(7e-8,1)
plt.xlim(0.14, 0.86)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
# plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()
plt.gca().set_yticks([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]);

# plt.savefig(f'perf_comparison_Rudiger_{short_exp_id(exp_id_bc_vs_udsg)}.png')

Apply for data¶

# read all data and do not skip anything (nothing was used for training)
df_data = pd.read_csv('../datasets/iter2/alljets_10-150GeV_base.csv')
df_data['flavour'] = 'data'
df_data = convert_float64_to_float32(df_data)

X = scaler_bc_vs_udsg.transform(df_data[feats_bc_vs_udsg])
y_data_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_data[feats_b_vs_c])
y_data_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

Plot score distributions¶

score distributions -- all pT¶

kwargs = dict(bins=np.linspace(0,150,151), density=0, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(df_udsg['Jet_Pt'], color='blue'   , label='udsg' , **kwargs);
plt.hist(df_c['Jet_Pt'],    color='orange' , label='c'    , **kwargs);
plt.hist(df_b['Jet_Pt'],    color='r'      , label='b'    , **kwargs);
plt.hist(df_data['Jet_Pt'], color='k'      , label='data' , **kwargs);
plt.legend()
plt.grid()
plt.semilogy()
plt.xlabel('jet $p_T^{reco}$ [GeV/c]');
plt.ylabel('counts');
# plt.savefig('pT_spectrum_by_flavour.png')

kwargs = dict(bins=np.linspace(-0.5,39.5,41), density=1, histtype='step' , lw=2)
var = 'Jet_NumTracks'

pt_queries = ['Jet_Pt > 0 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50 and Jet_Pt < 100']
for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(df_udsg[var][df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(df_c[var][df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    plt.hist(df_b[var][df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(df_data[var][df_data.query(query).index], color='k'      , label='data' , **kwargs);
    plt.legend()
    plt.grid()
    plt.semilogy()
    plt.xlabel('N tracks in jet');
    plt.ylabel('probability');
# plt.savefig('pT_spectrum_by_flavour.png')

kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_bc_vs_udsg, color='blue'   , label='udsg' , **kwargs);
plt.hist(y_c_proba_bc_vs_udsg,    color='orange' , label='c'    , **kwargs);
plt.hist(y_b_proba_bc_vs_udsg,    color='r'      , label='b'    , **kwargs);
plt.hist(y_data_proba_bc_vs_udsg, color='k'      , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $bc$ vs $udsg$');
plt.ylabel('counts');
# plt.savefig(f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour.png')

kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_b_vs_c, color='blue'   , label='udsg' , **kwargs);
plt.hist(y_c_proba_b_vs_c,    color='orange' , label='c'    , **kwargs);
plt.hist(y_b_proba_b_vs_c,    color='r'      , label='b'    , **kwargs);
plt.hist(y_data_proba_b_vs_c, color='k'      , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $b$ vs $c$');
plt.ylabel('counts');
# plt.savefig(f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour.png')

for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
                                        [y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                        [y_data_proba_b_vs_c    , y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c    ],
                                        ['data'                 , 'udsg',                  'c',                  'b'                 ],
                                    ): 
    plt.figure(figsize=(7,5))
    plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), cmin=50);
    plt.colorbar()
    plt.xlabel('score bc vs udsg')
    plt.ylabel('score b vs c')
    plt.title(f'{flavour}-jets')
    plt.tight_layout()
    plt.xlim([0,1])
    plt.ylim([0,1])
#     plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')

/cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages/matplotlib/colors.py:1062: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

score distirbutions -- pT differential¶

pt_queries = ['Jet_Pt > 10 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 40', 'Jet_Pt > 40 and Jet_Pt < 100']

kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0

for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    yrange = plt.ylim()
    plt.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index], color='k'      , label='data' , **kwargs);
    if logy:
        plt.semilogy()
        plt.legend(loc='lower center')
        plt.ylim(1e-3, 1e2)
        suffix = ''
    else:
        plt.legend(loc=(0.6, 0.6))
#         plt.ylim(yrange[0], yrange[1]*1.2)
        plt.ylim(0,10)
        suffix = '_liny'
    plt.grid()
    plt.xlabel('score $bc$ vs $udsg$');
    plt.ylabel('probability');
    plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
    plt.tight_layout()
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
    fname = f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour_pt{pt_range}{suffix}.png'
    print(fname)
#     plt.savefig(fname)

score_bc_vs_udsg_bcf99d_by_flavour_pt10-150_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt10-20_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt20-30_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt30-40_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt40-100_liny.png

kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0

for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(y_c_proba_b_vs_c[df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    yrange = plt.ylim()
    plt.hist(y_b_proba_b_vs_c[df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(y_data_proba_b_vs_c[df_data.query(query).index], color='k'      , label='data' , **kwargs);
    if logy:
        plt.semilogy()
        plt.legend(loc='lower center')
        plt.ylim(1e-3,2e1)
        suffix = ''
        plt.text(0.4, 0.9, query, transform=plt.gca().transAxes)
    else:
        plt.legend(loc=(0.6, 0.6))
#         plt.ylim(yrange[0], yrange[1]*1.2)
        plt.ylim(0,10)
        suffix = '_liny'    
        plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
        
    plt.grid()
    plt.xlabel('score $b$ vs $c$');
    plt.ylabel('counts');
    plt.tight_layout()
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
    fname = f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour_pt{pt_range}{suffix}.png'
    print(fname)
#     plt.savefig(fname)

score_b_vs_c_3ce14e_by_flavour_pt10-150_liny.png
score_b_vs_c_3ce14e_by_flavour_pt10-20_liny.png
score_b_vs_c_3ce14e_by_flavour_pt20-30_liny.png
score_b_vs_c_3ce14e_by_flavour_pt30-40_liny.png
score_b_vs_c_3ce14e_by_flavour_pt40-100_liny.png

# for q in ['Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50']:
n_bins = 15

for query in pt_queries:
    print()
    fig, axes = plt.subplots(ncols=4, figsize=(20,5))
    for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour, df, ax in zip(
                                            [y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                            [y_data_proba_b_vs_c    , y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c    ],
                                            ['data'                 , 'udsg',                  'c',                  'b'                 ],
                                            [df_data                ,  df_udsg               , df_c                , df_b                ],
                                            axes,
                                        ): 
        idx = df.query(query).index
        cmin = int(1e-4*len(idx))
        print(cmin)
        ax.hist2d(y_proba_bc_vs_udsg[idx], y_proba_b_vs_c[idx], 
                  bins=[np.linspace(0,1,n_bins+1), np.linspace(0,1,n_bins+1)], 
                  norm=mpl.colors.LogNorm(), 
                  cmin=cmin,
#                   density=1,
#                   vmin=1e-6, vmax=1e4,
                 );
    #     plt.colorbar()
        ax.set_xlabel('score bc vs udsg')
        ax.set_ylabel('score b vs c')
        pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
        ax.set_title(f'{flavour}-jets pT {pt_range} GeV/c')
        plt.tight_layout()
        ax.set_xlim([0,1])
        ax.set_ylim([0,1])
        fname = f'scores_2BDTs_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}_pt{pt_range}.png'
#         plt.savefig(fname)

14
25
25
25

12
4
3
4

1
3
3
5

0
2
2
4

0
12
13
9

Calculate c- and b-fractions in data¶

Template fit to score distribution - TOY¶

bins = np.linspace(0,1,101)
bin_centers = (bins[:-1]+bins[1:])/2


kwargs_np_hist = dict(bins=bins, density=0)
counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg, **kwargs_np_hist)
counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg   , **kwargs_np_hist)
counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg   , **kwargs_np_hist)

# y_test = y_udsg_proba_bc_vs_udsg*80 + y_c_proba_bc_vs_udsg*20 #+ y_b_proba_bc_vs_udsg*5 
# counts_test, x    = np.histogram(y_test   , **kwargs_np_hist)
counts_test = counts_udsg*87 + counts_c*10 + counts_b*3
noise = np.random.randint(1,100000,len(bin_centers))-50000 #+ np.linspace(0,500000, len(bin_centers)) - 250000
counts_test += noise.astype(int)



def func(x, n_udsg, n_c, n_b):
    counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
    ret = []
    for x_i in x:
        v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
        ret.append(v)
    return ret
popt, pcov = curve_fit(func, bin_centers, counts_test)




plt.figure(figsize=(8,6))
plt.plot(bin_centers, counts_udsg, label='udsg')
plt.plot(bin_centers, counts_c, label='c')
plt.plot(bin_centers, counts_b, label='b')
plt.plot(bin_centers, counts_test/100,'-', lw=2, c='k', label='artif. mix. udsg+c+b + noise')
plt.plot(bin_centers, (counts_test-noise.astype(int))/100,'--', lw=2, c='k', label='artif. mix udsg+c+b')
plt.semilogy()
plt.legend()

print(f'fractions from fit: {popt}')

fractions from fit: [86.16173043  9.39438703  3.40719083]

popt

array([88.67612689,  7.76936183,  3.46615454])

Template fit to score ditribution¶

def make_fit(query, clf_type, nbins, logy, verbose=True):
    bins = np.linspace(0,1,nbins+1)
    bin_centers = (bins[:-1]+bins[1:])/2

    kwargs_np_hist = dict(bins=bins, density=0)
    if clf_type == 'bc_vs_udsg':
        counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
        counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_np_hist)
        counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_np_hist)
        counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)
    elif clf_type == 'b_vs_c':
        counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
        counts_c, x    = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_np_hist)
        counts_b, x    = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_np_hist)
        counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])

    
    def func(x, n_udsg, n_c, n_b):
        counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
        ret = []
        for x_i in x:
            v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
            ret.append(v)
        return ret
    
    
    sum_data = np.sum(counts_data)
    sum_udsg = np.sum(counts_udsg)
    fit_max = sum_data/sum_udsg
    
    sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
    popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas, 
                           bounds=([0.7*fit_max,0,0], [fit_max,fit_max,fit_max]))
    
    residuals = func(bin_centers, *popt) - counts_data
    chi2 = np.sum((residuals / sigmas)**2)
    ndf  = len(bin_centers) - 3
    if verbose: 
        print(f'results for pT = {pt_range},  {nbins} bins: \n\t\t\t fit_max={fit_max:.3f} raw frac. = {[round(p,4) for p in popt]}\n\t\t  \
            frac. = {[round(p/sum(popt),3) for p in popt]} uncert. = {[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}\n\t\t \
            chi2 / Ndf = {chi2:.1f}/{ndf} = {chi2/ndf:.1f}')
        
    
    fig,axes = plt.subplots(figsize=(16,4), ncols=2)
    kwargs_plt_hist = dict(bins=bins, density=1, histtype='step', lw=2)
    ax = axes[0]
    if clf_type == 'bc_vs_udsg':
        ax.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
        ax.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_plt_hist, color='orange', label='c')
        ax.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_plt_hist, color='r', label='r')
        ax.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
        axes[0].set_xlabel('score $bc$-vs-$udsg$')
        axes[1].set_xlabel('score $bc$-vs-$udsg$')
    elif clf_type == 'b_vs_c':
        ax.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
        ax.hist(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_plt_hist, color='orange', label='c')
        ax.hist(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_plt_hist, color='r', label='r')
        ax.hist(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
        axes[0].set_xlabel('score $b$-vs-$c$')
        axes[1].set_xlabel('score $b$-vs-$c$')
        
    if logy:
        ax.legend(loc='lower center')
        ax.semilogy()
    else:
        ax.legend(loc='upper left')
        ax.set_ylim(0, 10)

    ax = axes[1]
    ax.bar(bin_centers, counts_udsg*popt[0], 1/len(bin_centers)                                             , label='udsg', color='b', alpha=0.5)
    ax.bar(bin_centers, counts_c*popt[1]   , 1/len(bin_centers), bottom=counts_udsg*popt[0]                 , label='c', color='orange', alpha=0.7)
    ax.bar(bin_centers, counts_b*popt[2]   , 1/len(bin_centers), bottom=counts_udsg*popt[0]+counts_c*popt[1], label='b', color='r', alpha=0.8)
    ax.errorbar(bin_centers, counts_data, yerr=np.sqrt(counts_data), fmt='o', color='k', alpha=0.8, label='data')
    if logy:
        ax.legend(loc='lower center')
        ax.semilogy()
        txt_pos = 0.98, 0.9
    else:
        ax.legend(loc='upper right')
        txt_pos = 0.98, 0.4
        
    ax.text(*txt_pos, f'chi2/Ndf = {chi2/ndf:.1f}', 
            transform=ax.transAxes, 
            fontsize=12, horizontalalignment='right',
           bbox=dict(facecolor='white', alpha=0.2)
           )        
    
    plt.suptitle(query)
    return axes

LINK TO HERE

Plots of template fitting¶

below you can find plots of fitting data score distribution to templates coming from MC.
Templates are shown on the left and fitted distribution on the right.
There are plots for:

scores distribution of both models: bc-vs-udsg and b-vs-c
with linear and log y-scale
for 4 $p_T$ bins: 10-20, 20-30, 30-40 and 40-100 GeV/c

bc-vs-udsg (linscale)

for query in pt_queries[1:]:
    make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=0, verbose=0)

bc-vs-udsg (logscale)

for query in pt_queries[1:]:
    make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=1, verbose=0)

b-vs-c (linscale)

for query in pt_queries[1:]:
    make_fit(query, clf_type='b_vs_c', nbins=50, logy=0, verbose=0)

b-vs-c (logscale)

for query in pt_queries[1:]:
    make_fit(query, clf_type='b_vs_c', nbins=50, logy=1, verbose=0)

Uncertainty of the fit estimation by param. variation¶

def func(x, n_udsg, n_c, n_b):
    counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
    ret = []
    for x_i in x:
        v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
        ret.append(v)
    return ret



# nbins = 20
logy = 0
dpt = 0.1

fit_res = dict()

for query_unvaried in pt_queries[1:]:
    print()
    lim1,lim2 = [int(x) for x in np.array(query_unvaried.split(' '))[[2,6]]]
    fit_res[(lim1,lim2)] = dict(udsg=[], c=[], b=[])
    for lim1_varied in [lim1, lim1*(1+dpt), lim1*(1-dpt)]:
        for lim2_varied in [lim2, lim2*(1+dpt), lim2*(1-dpt)]:
            for nbins in [30, 50, 100]:
                query = f'Jet_Pt > {lim1_varied:.1f} and Jet_Pt < {lim2_varied:.1f}'

                bins = np.linspace(0,1,nbins+1)
                bin_centers = (bins[:-1]+bins[1:])/2

                kwargs_np_hist = dict(bins=bins, density=0)
#                 counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
#                 counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_np_hist)
#                 counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_np_hist)
#                 counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)

                counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
                counts_c, x    = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_np_hist)
                counts_b, x    = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_np_hist)
                counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)

                pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])

                sum_data = np.sum(counts_data)
                sum_udsg = np.sum(counts_udsg)
                fit_max = sum_data/sum_udsg
                
                sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
                popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas, 
                                       bounds=([fit_max*0.7, 0, 0], fit_max))

                residuals = func(bin_centers, *popt) - counts_data
                chi2 = np.sum((residuals / sigmas)**2)
                ndf  = len(bin_centers) - 3
                
                print(f'q={query}, nbins={nbins}: \tchi2/Ndf = {chi2/ndf:.1f} \t frac = {[round(p/sum(popt),3) for p in popt]}')
               
                fit_res[(lim1,lim2)]['udsg'].append(popt[0]/sum(popt))
                fit_res[(lim1,lim2)]['c'].append(popt[1]/sum(popt))
                fit_res[(lim1,lim2)]['b'].append(popt[2]/sum(popt))
#                 print(f'results for pT = {pt_range},  {nbins} bins: {[round(p,4) for p in popt]} {[round(p/sum(popt),3) for p in popt]}\t{[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}')

q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 205.0 	 frac = [0.746, 0.241, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 123.6 	 frac = [0.747, 0.24, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 62.2 	 frac = [0.748, 0.239, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 290.5 	 frac = [0.751, 0.236, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 171.6 	 frac = [0.752, 0.235, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 85.1 	 frac = [0.753, 0.235, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 163.6 	 frac = [0.739, 0.247, 0.014]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 98.1 	 frac = [0.74, 0.246, 0.014]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 49.7 	 frac = [0.741, 0.245, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 120.2 	 frac = [0.754, 0.232, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 73.3 	 frac = [0.754, 0.231, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 37.6 	 frac = [0.756, 0.23, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 169.1 	 frac = [0.756, 0.229, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 100.1 	 frac = [0.757, 0.229, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 50.2 	 frac = [0.758, 0.228, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 95.9 	 frac = [0.747, 0.237, 0.016]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 57.9 	 frac = [0.748, 0.236, 0.016]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 30.0 	 frac = [0.75, 0.235, 0.016]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 205.0 	 frac = [0.746, 0.241, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 123.6 	 frac = [0.747, 0.24, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 62.2 	 frac = [0.748, 0.239, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 290.5 	 frac = [0.751, 0.236, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 171.6 	 frac = [0.752, 0.235, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 85.1 	 frac = [0.753, 0.235, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 163.6 	 frac = [0.739, 0.247, 0.014]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 98.1 	 frac = [0.74, 0.246, 0.014]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 49.7 	 frac = [0.741, 0.245, 0.014]

q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 11.5 	 frac = [0.789, 0.191, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 7.0 	 frac = [0.792, 0.188, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 4.5 	 frac = [0.796, 0.187, 0.018]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 18.5 	 frac = [0.79, 0.19, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 10.6 	 frac = [0.793, 0.188, 0.019]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 6.4 	 frac = [0.797, 0.186, 0.017]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 7.8 	 frac = [0.788, 0.192, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 5.2 	 frac = [0.791, 0.189, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 3.6 	 frac = [0.795, 0.188, 0.017]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 5.8 	 frac = [0.795, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 3.6 	 frac = [0.798, 0.184, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 2.9 	 frac = [0.808, 0.177, 0.015]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 9.6 	 frac = [0.794, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 5.6 	 frac = [0.797, 0.185, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 4.1 	 frac = [0.808, 0.177, 0.015]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 4.7 	 frac = [0.795, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 2.9 	 frac = [0.798, 0.185, 0.017]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 2.3 	 frac = [0.809, 0.177, 0.013]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 22.4 	 frac = [0.786, 0.197, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 12.8 	 frac = [0.787, 0.195, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 7.2 	 frac = [0.79, 0.193, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 34.4 	 frac = [0.787, 0.194, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 19.1 	 frac = [0.789, 0.193, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 10.4 	 frac = [0.791, 0.191, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 15.8 	 frac = [0.784, 0.199, 0.017]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 9.4 	 frac = [0.786, 0.196, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 5.6 	 frac = [0.789, 0.194, 0.017]

q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 3.6 	 frac = [0.823, 0.164, 0.013]
q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 2.5 	 frac = [0.828, 0.162, 0.01]
q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 1.7 	 frac = [0.845, 0.142, 0.013]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 34.4 	 frac = [0.847, 0.123, 0.03]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 20.4 	 frac = [0.852, 0.122, 0.026]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 10.9 	 frac = [0.876, 0.087, 0.037]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 2.3 	 frac = [0.825, 0.151, 0.024]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 1.6 	 frac = [0.835, 0.145, 0.02]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 1.5 	 frac = [0.845, 0.142, 0.013]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 3.3 	 frac = [0.868, 0.117, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.899, 0.087, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 1.7 	 frac = [0.903, 0.082, 0.016]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 16.3 	 frac = [0.839, 0.128, 0.033]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 9.8 	 frac = [0.847, 0.122, 0.032]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 5.5 	 frac = [0.871, 0.085, 0.044]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 2.2 	 frac = [0.928, 0.059, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 1.8 	 frac = [0.915, 0.042, 0.043]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.938, 0.019, 0.043]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 5.1 	 frac = [0.812, 0.17, 0.019]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 3.4 	 frac = [0.817, 0.166, 0.017]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 2.6 	 frac = [0.835, 0.15, 0.015]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 71.4 	 frac = [0.881, 0.075, 0.045]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 41.9 	 frac = [0.886, 0.071, 0.043]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 22.6 	 frac = [0.913, 0.045, 0.041]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 3.7 	 frac = [0.811, 0.173, 0.016]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.817, 0.167, 0.015]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 2.3 	 frac = [0.848, 0.14, 0.012]

q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 4.2 	 frac = [0.854, 0.104, 0.042]
q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.853, 0.094, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.809, 0.138, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 4.5 	 frac = [0.852, 0.105, 0.043]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 2.8 	 frac = [0.852, 0.094, 0.054]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 1.4 	 frac = [0.808, 0.138, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 3.7 	 frac = [0.855, 0.104, 0.042]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 2.3 	 frac = [0.854, 0.093, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 1.2 	 frac = [0.809, 0.138, 0.053]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 3.1 	 frac = [0.828, 0.133, 0.04]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 2.0 	 frac = [0.833, 0.13, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.812, 0.1, 0.088]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 3.3 	 frac = [0.826, 0.135, 0.04]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 2.2 	 frac = [0.832, 0.13, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 1.4 	 frac = [0.809, 0.102, 0.089]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 2.8 	 frac = [0.829, 0.131, 0.039]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 1.8 	 frac = [0.835, 0.127, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 1.2 	 frac = [0.813, 0.099, 0.088]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 14.3 	 frac = [0.845, 0.095, 0.06]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 8.8 	 frac = [0.865, 0.074, 0.061]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 4.5 	 frac = [0.845, 0.099, 0.057]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 15.3 	 frac = [0.846, 0.092, 0.062]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 9.4 	 frac = [0.867, 0.07, 0.063]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 4.8 	 frac = [0.847, 0.095, 0.058]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 12.9 	 frac = [0.843, 0.099, 0.058]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 8.0 	 frac = [0.863, 0.077, 0.059]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 4.1 	 frac = [0.842, 0.104, 0.055]

x1,x2 = [],[]
x = []
xerr = []
y = []
yerr = []
for k,v in fit_res.items():
    print(k)
    x1.append(k[0])
    x2.append(k[1])
    x.append((k[0]+k[1])/2)
    xerr.append((k[1]-k[0])/2)
    r = v['c']
    y.append(np.median(r))
    yerr.append(np.std(r))
#     yerr.append(np.percentile(r,75) - np.percentile(r,25))
    
plt.figure(figsize=(6,5))
plt.errorbar(x,y,yerr=yerr,xerr=xerr, fmt='o');
plt.ylim(bottom=0)
plt.grid()
plt.xlabel('$p_T^{\,jet}$ [GeV/c]');
plt.ylabel('$c$-fraction')
plt.title('based on template fit to $b$-vs-$c$ score distr.', fontsize=12)
plt.tight_layout()
# plt.savefig('c-frac_template-fit_b-vs-c.png')

(10, 20)
(20, 30)
(30, 40)
(40, 100)

fit_res[(30,40)]['b']

[0.010982703552412015,
 0.03031037100220878,
 0.010664275394406815,
 0.015066153466698592,
 0.03782409329717273,
 0.04248267628406001,
 0.012062322474706578,
 0.03398082278783701,
 0.01073674833724234]

plt.hist(fit_res[(30,40)]['c'], bins=10);

LINK TO PLOTS OF TEMPLATE FITTING¶