In [1]:
from comet_ml import Experiment
import comet_ml
import pickle

import os
from functools import partial

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc, f1_score, roc_curve, roc_auc_score, classification_report, confusion_matrix, auc
from xgboost import XGBClassifier
from scipy.optimize import curve_fit
In [3]:
import sys 
sys.path.append('..')
from helper.plotting import plot_roc, plot_score_vs_pt, plot_tagging_eff, plot_confusion_matrix, plot_xgb_learning_curve, plot_score_distr, plot_signal_significance
from helper.utils import signal_eff, get_optimal_threshold, convert_float64_to_float32, save_model, printmd
In [4]:
plt.rcParams['font.size']=16
pd.options.display.max_columns = 200

Load data from csv

In [5]:
nrows_b    = 250000
nrows_c    = 250000
nrows_udsg = 250000

skiprows   = 200000
In [8]:
df_b = pd.read_csv('../datasets/iter2/bjets_10-150GeV_base.csv', nrows=nrows_b, skiprows=range(1,skiprows))
df_b['flavour'] = 'b'
df_b = convert_float64_to_float32(df_b)
In [9]:
df_c = pd.read_csv('../datasets/iter2/cjets_10-150GeV_base.csv', nrows=nrows_c, skiprows=range(1,skiprows))
df_c['flavour'] = 'c'
df_c = convert_float64_to_float32(df_c)
In [10]:
df_udsg = pd.read_csv('../datasets/iter2/udsgjets_10-150GeV_base.csv', nrows=nrows_udsg, skiprows=range(1,skiprows))
df_udsg['flavour'] = 'udsg'
df_udsg = convert_float64_to_float32(df_udsg)

Load models from comet.ml

TODO:

  • ?? select_best_model(metric='test_roc_auc', constrains='test_metric / train_metric < 1.05')
In [11]:
def get_model_from_exp(exp_id, model_type=XGBClassifier, featnames_type=(pd.Index, pd.Series, np.array, list), scaler_type=StandardScaler, api=comet_ml.API()):
    exp = api.get(exp_id)
    assets = exp.get_model_asset_list(exp.get_model_names()[0])
    asset_id_model     = assets[  ['model' in a['fileName']     for a in assets].index(True)  ]['assetId']
    asset_id_featnames = assets[  ['feat' in a['fileName'] for a in assets].index(True)  ]['assetId']
    asset_id_scaler    = assets[  ['scaler' in a['fileName']    for a in assets].index(True)  ]['assetId']

    model_bin = exp.get_asset(asset_id_model)
    model = pickle.loads(model_bin)
    assert isinstance(model, model_type)
    
    featnames_bin = exp.get_asset(asset_id_featnames)
    featnames = pickle.loads(featnames_bin)
    assert isinstance(featnames, featnames_type)
    
    scaler_bin = exp.get_asset(asset_id_scaler)
    scaler = pickle.loads(scaler_bin)
    assert isinstance(scaler, scaler_type)
    
    return model, np.array(featnames), scaler
In [12]:
exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/bcf99db8f5a94b2184e6e13161c50bbe'
# exp_id_bc_vs_udsg = 'phd/bc-vs-udsg/61c014a2ff7c49e8bae9ec466ffaa998'
exp_id_b_vs_c     = 'phd/b-vs-c/3ce14e4e99d54283bc66eb24c98b6468' 
clf_bc_vs_udsg , feats_bc_vs_udsg, scaler_bc_vs_udsg = get_model_from_exp(exp_id_bc_vs_udsg)
clf_b_vs_c     , feats_b_vs_c    , scaler_b_vs_c     = get_model_from_exp(exp_id_b_vs_c)
feats_all = np.unique(np.hstack([feats_bc_vs_udsg, feats_b_vs_c]))

def short_exp_id(exp_id):
    return exp_id.split('/')[-1][:6]

Apply models

In [13]:
X = scaler_bc_vs_udsg.transform(df_b[feats_bc_vs_udsg])
y_b_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_b[feats_b_vs_c])
y_b_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

X = scaler_bc_vs_udsg.transform(df_c[feats_bc_vs_udsg])
y_c_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_c[feats_b_vs_c])
y_c_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

X = scaler_bc_vs_udsg.transform(df_udsg[feats_bc_vs_udsg])
y_udsg_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_udsg[feats_b_vs_c])
y_udsg_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

Plot scores distributions

2D histos / scatterplots for each flavour

In [11]:
n = 100000
alpha = 1

fig,ax = plt.subplots(figsize=(10,8))
ax.plot(y_udsg_proba_bc_vs_udsg[:n], y_udsg_proba_b_vs_c[:n], ',', c='b', alpha=alpha)
ax.plot(y_c_proba_bc_vs_udsg[:n], y_c_proba_b_vs_c[:n], ',', c='orange', alpha=alpha)
ax.plot(y_b_proba_bc_vs_udsg[:n], y_b_proba_b_vs_c[:n], ',', c='r', alpha=alpha)
ax.set_xlabel('score bc vs udsg')
ax.set_ylabel('score b vs c')
ax.set_xlim([0,1])
ax.set_ylim([0,1])
# plt.savefig(f'scores_2BDTs_all-flavours_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')
Out[11]:
(0, 1)
In [12]:
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
                                        [y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                        [y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c],
                                        ['udsg',                  'c',                  'b'],
                                    ):
    plt.figure(figsize=(7,5))
    plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), vmin=10, vmax=3000);
    plt.colorbar()
    plt.xlabel('score bc vs udsg')
    plt.ylabel('score b vs c')
    plt.title(f'{flavour}-jets')
    plt.tight_layout()
    plt.xlim([0,1])
    plt.ylim([0,1])
#     plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')

Comparison with others

In [13]:
b_eff, c_eff, udsg_eff = [], [], []
thresholds = np.linspace(0.84, 0.99, 40)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    c_eff.append(   sum(y_c_proba_bc_vs_udsg > t)    / len(y_c_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))
0.84
0.8438461538461538
0.8476923076923076
0.8515384615384615
0.8553846153846154
0.8592307692307692
0.8630769230769231
0.8669230769230769
0.8707692307692307
0.8746153846153846
0.8784615384615384
0.8823076923076922
0.8861538461538462
0.89
0.8938461538461538
0.8976923076923077
0.9015384615384615
0.9053846153846153
0.9092307692307692
0.9130769230769231
0.916923076923077
0.9207692307692308
0.9246153846153846
0.9284615384615384
0.9323076923076923
0.9361538461538461
0.94
0.9438461538461538
0.9476923076923077
0.9515384615384616
0.9553846153846154
0.9592307692307692
0.963076923076923
0.9669230769230769
0.9707692307692308
0.9746153846153847
0.9784615384615385
0.9823076923076923
0.9861538461538462
0.99
In [14]:
plt.plot(thresholds, udsg_eff, '.-')
plt.semilogy()
Out[14]:
[]
In [15]:
plt.plot(udsg_eff, c_eff, '.-', c='lime', label='c-jets')
plt.plot(udsg_eff, b_eff, 'r.-', label='b-jets')
plt.xlabel('$udsg$ mistagging rate')
plt.ylabel('($b,c$)-jet tag efficiency')
plt.xlim(0,0.02)
plt.ylim(0,1)
plt.grid()
plt.legend()
plt.tight_layout()
# plt.savefig(f'perf_comparison_LHCb_{short_exp_id(exp_id_bc_vs_udsg)}_unzoom.png')
In [16]:
idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index
In [17]:
b_eff, udsg_eff = [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))
    
    udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
    b_eff_3040.append(   sum(y_b_proba_bc_vs_udsg[idx_b] > t)       / len(y_b_proba_bc_vs_udsg[idx_b]))
0.001
0.02136734693877551
0.04173469387755102
0.06210204081632653
0.08246938775510204
0.10283673469387755
0.12320408163265306
0.14357142857142857
0.16393877551020408
0.1843061224489796
0.2046734693877551
0.2250408163265306
0.24540816326530612
0.26577551020408163
0.28614285714285714
0.30651020408163265
0.32687755102040816
0.3472448979591837
0.3676122448979592
0.3879795918367347
0.4083469387755102
0.4287142857142857
0.4490816326530612
0.46944897959183673
0.48981632653061224
0.5101836734693878
0.5305510204081633
0.5509183673469388
0.5712857142857143
0.5916530612244898
0.6120204081632653
0.6323877551020408
0.6527551020408163
0.6731224489795918
0.6934897959183673
0.7138571428571429
0.7342244897959184
0.7545918367346939
0.7749591836734694
0.7953265306122449
0.8156938775510204
0.8360612244897959
0.8564285714285714
0.8767959183673469
0.8971632653061224
0.917530612244898
0.9378979591836735
0.958265306122449
0.9786326530612245
0.999
In [18]:
plt.figure(figsize=(6,7))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(1e-4,1)
plt.xlim(0,1)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()

# plt.savefig(f'perf_comparison_HadiHassan_{short_exp_id(exp_id_bc_vs_udsg)}.png')
In [19]:
idx_b = df_b.query('Jet_Pt > 30 and Jet_Pt < 40').index
idx_udsg = df_udsg.query('Jet_Pt > 30 and Jet_Pt < 40').index
In [20]:
b_eff, c_eff, udsg_eff = [], [], []
b_eff_3040, udsg_eff_3040 = [], []
thresholds = np.linspace(0.001, 0.999, 50)
for t in thresholds:
    print(t)
    udsg_eff.append(sum(y_udsg_proba_bc_vs_udsg > t) / len(y_udsg_proba_bc_vs_udsg))
    b_eff.append(   sum(y_b_proba_bc_vs_udsg > t)    / len(y_b_proba_bc_vs_udsg))
    
    udsg_eff_3040.append(sum(y_udsg_proba_bc_vs_udsg[idx_udsg] > t) / len(y_udsg_proba_bc_vs_udsg[idx_udsg]))
    b_eff_3040.append(   sum(y_b_proba_bc_vs_udsg[idx_b] > t)       / len(y_b_proba_bc_vs_udsg[idx_b]))
0.001
0.02136734693877551
0.04173469387755102
0.06210204081632653
0.08246938775510204
0.10283673469387755
0.12320408163265306
0.14357142857142857
0.16393877551020408
0.1843061224489796
0.2046734693877551
0.2250408163265306
0.24540816326530612
0.26577551020408163
0.28614285714285714
0.30651020408163265
0.32687755102040816
0.3472448979591837
0.3676122448979592
0.3879795918367347
0.4083469387755102
0.4287142857142857
0.4490816326530612
0.46944897959183673
0.48981632653061224
0.5101836734693878
0.5305510204081633
0.5509183673469388
0.5712857142857143
0.5916530612244898
0.6120204081632653
0.6323877551020408
0.6527551020408163
0.6731224489795918
0.6934897959183673
0.7138571428571429
0.7342244897959184
0.7545918367346939
0.7749591836734694
0.7953265306122449
0.8156938775510204
0.8360612244897959
0.8564285714285714
0.8767959183673469
0.8971632653061224
0.917530612244898
0.9378979591836735
0.958265306122449
0.9786326530612245
0.999
In [21]:
plt.figure(figsize=(9,6))
plt.plot(b_eff[:-1], udsg_eff[:-1], '.-', label='$10 < p_T < 150$ GeV/c')
plt.plot(b_eff_3040[:-1], udsg_eff_3040[:-1], '.-', label='$30 < p_T < 40$ GeV/c')
plt.legend()
plt.semilogy()
plt.ylim(7e-8,1)
plt.xlim(0.14, 0.86)
plt.xlabel('b-jets tag. eff.')
plt.ylabel('udsg-jets mistag. rate')
plt.grid()
# plt.scatter([0.42, 0.66, 0.865], [2.8e-2, 1.05e-1, 0.38], c=['lime', 'r', 'k'])
plt.tight_layout()
plt.gca().set_yticks([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]);

# plt.savefig(f'perf_comparison_Rudiger_{short_exp_id(exp_id_bc_vs_udsg)}.png')

Apply for data

In [17]:
# read all data and do not skip anything (nothing was used for training)
df_data = pd.read_csv('../datasets/iter2/alljets_10-150GeV_base.csv')
df_data['flavour'] = 'data'
df_data = convert_float64_to_float32(df_data)
In [18]:
X = scaler_bc_vs_udsg.transform(df_data[feats_bc_vs_udsg])
y_data_proba_bc_vs_udsg = clf_bc_vs_udsg.predict_proba(X)[:,1]
X = scaler_b_vs_c.transform(df_data[feats_b_vs_c])
y_data_proba_b_vs_c = clf_b_vs_c.predict_proba(X)[:,1]

Plot score distributions

score distributions -- all pT

In [15]:
kwargs = dict(bins=np.linspace(0,150,151), density=0, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(df_udsg['Jet_Pt'], color='blue'   , label='udsg' , **kwargs);
plt.hist(df_c['Jet_Pt'],    color='orange' , label='c'    , **kwargs);
plt.hist(df_b['Jet_Pt'],    color='r'      , label='b'    , **kwargs);
plt.hist(df_data['Jet_Pt'], color='k'      , label='data' , **kwargs);
plt.legend()
plt.grid()
plt.semilogy()
plt.xlabel('jet $p_T^{reco}$ [GeV/c]');
plt.ylabel('counts');
# plt.savefig('pT_spectrum_by_flavour.png')
In [16]:
kwargs = dict(bins=np.linspace(-0.5,39.5,41), density=1, histtype='step' , lw=2)
var = 'Jet_NumTracks'

pt_queries = ['Jet_Pt > 0 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50 and Jet_Pt < 100']
for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(df_udsg[var][df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(df_c[var][df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    plt.hist(df_b[var][df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(df_data[var][df_data.query(query).index], color='k'      , label='data' , **kwargs);
    plt.legend()
    plt.grid()
    plt.semilogy()
    plt.xlabel('N tracks in jet');
    plt.ylabel('probability');
# plt.savefig('pT_spectrum_by_flavour.png')
In [17]:
kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_bc_vs_udsg, color='blue'   , label='udsg' , **kwargs);
plt.hist(y_c_proba_bc_vs_udsg,    color='orange' , label='c'    , **kwargs);
plt.hist(y_b_proba_bc_vs_udsg,    color='r'      , label='b'    , **kwargs);
plt.hist(y_data_proba_bc_vs_udsg, color='k'      , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $bc$ vs $udsg$');
plt.ylabel('counts');
# plt.savefig(f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour.png')
In [18]:
kwargs = dict(bins=np.linspace(0,1,200), density=1, histtype='step' , lw=2)

plt.figure(figsize=(8,5))
plt.hist(y_udsg_proba_b_vs_c, color='blue'   , label='udsg' , **kwargs);
plt.hist(y_c_proba_b_vs_c,    color='orange' , label='c'    , **kwargs);
plt.hist(y_b_proba_b_vs_c,    color='r'      , label='b'    , **kwargs);
plt.hist(y_data_proba_b_vs_c, color='k'      , label='data' , **kwargs);
plt.legend(loc='lower center')
plt.grid()
plt.semilogy()
plt.xlabel('score $b$ vs $c$');
plt.ylabel('counts');
# plt.savefig(f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour.png')
In [19]:
for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour in zip(
                                        [y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                        [y_data_proba_b_vs_c    , y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c    ],
                                        ['data'                 , 'udsg',                  'c',                  'b'                 ],
                                    ): 
    plt.figure(figsize=(7,5))
    plt.hist2d(y_proba_bc_vs_udsg, y_proba_b_vs_c, bins=50, norm=mpl.colors.LogNorm(), cmin=50);
    plt.colorbar()
    plt.xlabel('score bc vs udsg')
    plt.ylabel('score b vs c')
    plt.title(f'{flavour}-jets')
    plt.tight_layout()
    plt.xlim([0,1])
    plt.ylim([0,1])
#     plt.savefig(f'scores_2BDTs_{flavour}_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}.png')
/cvmfs/sft.cern.ch/lcg/views/LCG_96python3/x86_64-centos7-gcc8-opt/lib/python3.6/site-packages/matplotlib/colors.py:1062: RuntimeWarning: invalid value encountered in less_equal
  mask |= resdat <= 0

score distirbutions -- pT differential

In [32]:
pt_queries = ['Jet_Pt > 10 and Jet_Pt < 150', 'Jet_Pt > 10 and Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 40', 'Jet_Pt > 40 and Jet_Pt < 100']
In [21]:
kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0

for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    yrange = plt.ylim()
    plt.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index], color='k'      , label='data' , **kwargs);
    if logy:
        plt.semilogy()
        plt.legend(loc='lower center')
        plt.ylim(1e-3, 1e2)
        suffix = ''
    else:
        plt.legend(loc=(0.6, 0.6))
#         plt.ylim(yrange[0], yrange[1]*1.2)
        plt.ylim(0,10)
        suffix = '_liny'
    plt.grid()
    plt.xlabel('score $bc$ vs $udsg$');
    plt.ylabel('probability');
    plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
    plt.tight_layout()
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
    fname = f'score_bc_vs_udsg_{short_exp_id(exp_id_bc_vs_udsg)}_by_flavour_pt{pt_range}{suffix}.png'
    print(fname)
#     plt.savefig(fname)
score_bc_vs_udsg_bcf99d_by_flavour_pt10-150_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt10-20_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt20-30_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt30-40_liny.png
score_bc_vs_udsg_bcf99d_by_flavour_pt40-100_liny.png
In [22]:
kwargs = dict(bins=np.linspace(0,1,51), density=1, histtype='step' , lw=2)
logy = 0

for query in pt_queries:
    plt.figure(figsize=(8,5))
    plt.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index], color='blue'   , label='udsg' , **kwargs);
    plt.hist(y_c_proba_b_vs_c[df_c.query(query).index],    color='orange' , label='c'    , **kwargs);
    yrange = plt.ylim()
    plt.hist(y_b_proba_b_vs_c[df_b.query(query).index],    color='r'      , label='b'    , **kwargs);
    plt.hist(y_data_proba_b_vs_c[df_data.query(query).index], color='k'      , label='data' , **kwargs);
    if logy:
        plt.semilogy()
        plt.legend(loc='lower center')
        plt.ylim(1e-3,2e1)
        suffix = ''
        plt.text(0.4, 0.9, query, transform=plt.gca().transAxes)
    else:
        plt.legend(loc=(0.6, 0.6))
#         plt.ylim(yrange[0], yrange[1]*1.2)
        plt.ylim(0,10)
        suffix = '_liny'    
        plt.text(0.05, 0.9, query, transform=plt.gca().transAxes)
        
    plt.grid()
    plt.xlabel('score $b$ vs $c$');
    plt.ylabel('counts');
    plt.tight_layout()
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
    fname = f'score_b_vs_c_{short_exp_id(exp_id_b_vs_c)}_by_flavour_pt{pt_range}{suffix}.png'
    print(fname)
#     plt.savefig(fname)
score_b_vs_c_3ce14e_by_flavour_pt10-150_liny.png
score_b_vs_c_3ce14e_by_flavour_pt10-20_liny.png
score_b_vs_c_3ce14e_by_flavour_pt20-30_liny.png
score_b_vs_c_3ce14e_by_flavour_pt30-40_liny.png
score_b_vs_c_3ce14e_by_flavour_pt40-100_liny.png
In [23]:
# for q in ['Jet_Pt < 20', 'Jet_Pt > 20 and Jet_Pt < 30', 'Jet_Pt > 30 and Jet_Pt < 50', 'Jet_Pt > 50']:
n_bins = 15

for query in pt_queries:
    print()
    fig, axes = plt.subplots(ncols=4, figsize=(20,5))
    for y_proba_bc_vs_udsg, y_proba_b_vs_c, flavour, df, ax in zip(
                                            [y_data_proba_bc_vs_udsg, y_udsg_proba_bc_vs_udsg, y_c_proba_bc_vs_udsg, y_b_proba_bc_vs_udsg],
                                            [y_data_proba_b_vs_c    , y_udsg_proba_b_vs_c,     y_c_proba_b_vs_c,     y_b_proba_b_vs_c    ],
                                            ['data'                 , 'udsg',                  'c',                  'b'                 ],
                                            [df_data                ,  df_udsg               , df_c                , df_b                ],
                                            axes,
                                        ): 
        idx = df.query(query).index
        cmin = int(1e-4*len(idx))
        print(cmin)
        ax.hist2d(y_proba_bc_vs_udsg[idx], y_proba_b_vs_c[idx], 
                  bins=[np.linspace(0,1,n_bins+1), np.linspace(0,1,n_bins+1)], 
                  norm=mpl.colors.LogNorm(), 
                  cmin=cmin,
#                   density=1,
#                   vmin=1e-6, vmax=1e4,
                 );
    #     plt.colorbar()
        ax.set_xlabel('score bc vs udsg')
        ax.set_ylabel('score b vs c')
        pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])
        ax.set_title(f'{flavour}-jets pT {pt_range} GeV/c')
        plt.tight_layout()
        ax.set_xlim([0,1])
        ax.set_ylim([0,1])
        fname = f'scores_2BDTs_{short_exp_id(exp_id_bc_vs_udsg)}-{short_exp_id(exp_id_b_vs_c)}_pt{pt_range}.png'
#         plt.savefig(fname)
14
25
25
25

12
4
3
4

1
3
3
5

0
2
2
4

0
12
13
9

Calculate c- and b-fractions in data

Template fit to score distribution - TOY

In [29]:
bins = np.linspace(0,1,101)
bin_centers = (bins[:-1]+bins[1:])/2


kwargs_np_hist = dict(bins=bins, density=0)
counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg, **kwargs_np_hist)
counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg   , **kwargs_np_hist)
counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg   , **kwargs_np_hist)

# y_test = y_udsg_proba_bc_vs_udsg*80 + y_c_proba_bc_vs_udsg*20 #+ y_b_proba_bc_vs_udsg*5 
# counts_test, x    = np.histogram(y_test   , **kwargs_np_hist)
counts_test = counts_udsg*87 + counts_c*10 + counts_b*3
noise = np.random.randint(1,100000,len(bin_centers))-50000 #+ np.linspace(0,500000, len(bin_centers)) - 250000
counts_test += noise.astype(int)



def func(x, n_udsg, n_c, n_b):
    counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
    ret = []
    for x_i in x:
        v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
        ret.append(v)
    return ret
popt, pcov = curve_fit(func, bin_centers, counts_test)




plt.figure(figsize=(8,6))
plt.plot(bin_centers, counts_udsg, label='udsg')
plt.plot(bin_centers, counts_c, label='c')
plt.plot(bin_centers, counts_b, label='b')
plt.plot(bin_centers, counts_test/100,'-', lw=2, c='k', label='artif. mix. udsg+c+b + noise')
plt.plot(bin_centers, (counts_test-noise.astype(int))/100,'--', lw=2, c='k', label='artif. mix udsg+c+b')
plt.semilogy()
plt.legend()

print(f'fractions from fit: {popt}')
fractions from fit: [86.16173043  9.39438703  3.40719083]
In [20]:
 
In [21]:
popt
Out[21]:
array([88.67612689,  7.76936183,  3.46615454])

Template fit to score ditribution

In [33]:
def make_fit(query, clf_type, nbins, logy, verbose=True):
    bins = np.linspace(0,1,nbins+1)
    bin_centers = (bins[:-1]+bins[1:])/2

    kwargs_np_hist = dict(bins=bins, density=0)
    if clf_type == 'bc_vs_udsg':
        counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
        counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_np_hist)
        counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_np_hist)
        counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)
    elif clf_type == 'b_vs_c':
        counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
        counts_c, x    = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_np_hist)
        counts_b, x    = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_np_hist)
        counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)
    pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])

    
    def func(x, n_udsg, n_c, n_b):
        counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
        ret = []
        for x_i in x:
            v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
            ret.append(v)
        return ret
    
    
    sum_data = np.sum(counts_data)
    sum_udsg = np.sum(counts_udsg)
    fit_max = sum_data/sum_udsg
    
    sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
    popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas, 
                           bounds=([0.7*fit_max,0,0], [fit_max,fit_max,fit_max]))
    
    residuals = func(bin_centers, *popt) - counts_data
    chi2 = np.sum((residuals / sigmas)**2)
    ndf  = len(bin_centers) - 3
    if verbose: 
        print(f'results for pT = {pt_range},  {nbins} bins: \n\t\t\t fit_max={fit_max:.3f} raw frac. = {[round(p,4) for p in popt]}\n\t\t  \
            frac. = {[round(p/sum(popt),3) for p in popt]} uncert. = {[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}\n\t\t \
            chi2 / Ndf = {chi2:.1f}/{ndf} = {chi2/ndf:.1f}')
        
    
    fig,axes = plt.subplots(figsize=(16,4), ncols=2)
    kwargs_plt_hist = dict(bins=bins, density=1, histtype='step', lw=2)
    ax = axes[0]
    if clf_type == 'bc_vs_udsg':
        ax.hist(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
        ax.hist(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_plt_hist, color='orange', label='c')
        ax.hist(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_plt_hist, color='r', label='r')
        ax.hist(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
        axes[0].set_xlabel('score $bc$-vs-$udsg$')
        axes[1].set_xlabel('score $bc$-vs-$udsg$')
    elif clf_type == 'b_vs_c':
        ax.hist(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_plt_hist, color='b', label='udsg')
        ax.hist(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_plt_hist, color='orange', label='c')
        ax.hist(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_plt_hist, color='r', label='r')
        ax.hist(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_plt_hist, color='k', label='data')
        axes[0].set_xlabel('score $b$-vs-$c$')
        axes[1].set_xlabel('score $b$-vs-$c$')
        
    if logy:
        ax.legend(loc='lower center')
        ax.semilogy()
    else:
        ax.legend(loc='upper left')
        ax.set_ylim(0, 10)

    ax = axes[1]
    ax.bar(bin_centers, counts_udsg*popt[0], 1/len(bin_centers)                                             , label='udsg', color='b', alpha=0.5)
    ax.bar(bin_centers, counts_c*popt[1]   , 1/len(bin_centers), bottom=counts_udsg*popt[0]                 , label='c', color='orange', alpha=0.7)
    ax.bar(bin_centers, counts_b*popt[2]   , 1/len(bin_centers), bottom=counts_udsg*popt[0]+counts_c*popt[1], label='b', color='r', alpha=0.8)
    ax.errorbar(bin_centers, counts_data, yerr=np.sqrt(counts_data), fmt='o', color='k', alpha=0.8, label='data')
    if logy:
        ax.legend(loc='lower center')
        ax.semilogy()
        txt_pos = 0.98, 0.9
    else:
        ax.legend(loc='upper right')
        txt_pos = 0.98, 0.4
        
    ax.text(*txt_pos, f'chi2/Ndf = {chi2/ndf:.1f}', 
            transform=ax.transAxes, 
            fontsize=12, horizontalalignment='right',
           bbox=dict(facecolor='white', alpha=0.2)
           )        
    
    plt.suptitle(query)
    return axes
    

Plots of template fitting

below you can find plots of fitting data score distribution to templates coming from MC.
Templates are shown on the left and fitted distribution on the right.
There are plots for:

  • scores distribution of both models: bc-vs-udsg and b-vs-c
  • with linear and log y-scale
  • for 4 $p_T$ bins: 10-20, 20-30, 30-40 and 40-100 GeV/c

bc-vs-udsg (linscale)

In [34]:
for query in pt_queries[1:]:
    make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=0, verbose=0)

bc-vs-udsg (logscale)

In [244]:
for query in pt_queries[1:]:
    make_fit(query, clf_type='bc_vs_udsg', nbins=50, logy=1, verbose=0)

b-vs-c (linscale)

In [245]:
for query in pt_queries[1:]:
    make_fit(query, clf_type='b_vs_c', nbins=50, logy=0, verbose=0)

b-vs-c (logscale)

In [246]:
for query in pt_queries[1:]:
    make_fit(query, clf_type='b_vs_c', nbins=50, logy=1, verbose=0)

Uncertainty of the fit estimation by param. variation

In [212]:
def func(x, n_udsg, n_c, n_b):
    counts = counts_udsg*n_udsg + counts_c*n_c + counts_b*n_b
    ret = []
    for x_i in x:
        v = [c for bc, c in zip(bin_centers, counts) if abs(bc-x_i) < 1e-6][0]
        ret.append(v)
    return ret



# nbins = 20
logy = 0
dpt = 0.1

fit_res = dict()

for query_unvaried in pt_queries[1:]:
    print()
    lim1,lim2 = [int(x) for x in np.array(query_unvaried.split(' '))[[2,6]]]
    fit_res[(lim1,lim2)] = dict(udsg=[], c=[], b=[])
    for lim1_varied in [lim1, lim1*(1+dpt), lim1*(1-dpt)]:
        for lim2_varied in [lim2, lim2*(1+dpt), lim2*(1-dpt)]:
            for nbins in [30, 50, 100]:
                query = f'Jet_Pt > {lim1_varied:.1f} and Jet_Pt < {lim2_varied:.1f}'

                bins = np.linspace(0,1,nbins+1)
                bin_centers = (bins[:-1]+bins[1:])/2

                kwargs_np_hist = dict(bins=bins, density=0)
#                 counts_udsg, x = np.histogram(y_udsg_proba_bc_vs_udsg[df_udsg.query(query).index] , **kwargs_np_hist)
#                 counts_c, x    = np.histogram(y_c_proba_bc_vs_udsg[df_c.query(query).index]       , **kwargs_np_hist)
#                 counts_b, x    = np.histogram(y_b_proba_bc_vs_udsg[df_b.query(query).index]       , **kwargs_np_hist)
#                 counts_data, x = np.histogram(y_data_proba_bc_vs_udsg[df_data.query(query).index] , **kwargs_np_hist)

                counts_udsg, x = np.histogram(y_udsg_proba_b_vs_c[df_udsg.query(query).index] , **kwargs_np_hist)
                counts_c, x    = np.histogram(y_c_proba_b_vs_c[df_c.query(query).index]       , **kwargs_np_hist)
                counts_b, x    = np.histogram(y_b_proba_b_vs_c[df_b.query(query).index]       , **kwargs_np_hist)
                counts_data, x = np.histogram(y_data_proba_b_vs_c[df_data.query(query).index] , **kwargs_np_hist)

                pt_range = '-'.join([subs for subs in query.split(' ') if subs.isdigit()])

                sum_data = np.sum(counts_data)
                sum_udsg = np.sum(counts_udsg)
                fit_max = sum_data/sum_udsg
                
                sigmas = [np.sqrt(c) if c else 1e8 for c in counts_data]
                popt, pcov = curve_fit(func, bin_centers, counts_data, sigma=sigmas, 
                                       bounds=([fit_max*0.7, 0, 0], fit_max))

                residuals = func(bin_centers, *popt) - counts_data
                chi2 = np.sum((residuals / sigmas)**2)
                ndf  = len(bin_centers) - 3
                
                print(f'q={query}, nbins={nbins}: \tchi2/Ndf = {chi2/ndf:.1f} \t frac = {[round(p/sum(popt),3) for p in popt]}')
               
                fit_res[(lim1,lim2)]['udsg'].append(popt[0]/sum(popt))
                fit_res[(lim1,lim2)]['c'].append(popt[1]/sum(popt))
                fit_res[(lim1,lim2)]['b'].append(popt[2]/sum(popt))
#                 print(f'results for pT = {pt_range},  {nbins} bins: {[round(p,4) for p in popt]} {[round(p/sum(popt),3) for p in popt]}\t{[round(pc/sum(popt),5) for pc in pcov[[0,1,2], [0,1,2]]]}')
q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 205.0 	 frac = [0.746, 0.241, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 123.6 	 frac = [0.747, 0.24, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 62.2 	 frac = [0.748, 0.239, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 290.5 	 frac = [0.751, 0.236, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 171.6 	 frac = [0.752, 0.235, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 85.1 	 frac = [0.753, 0.235, 0.013]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 163.6 	 frac = [0.739, 0.247, 0.014]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 98.1 	 frac = [0.74, 0.246, 0.014]
q=Jet_Pt > 10.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 49.7 	 frac = [0.741, 0.245, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 120.2 	 frac = [0.754, 0.232, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 73.3 	 frac = [0.754, 0.231, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 37.6 	 frac = [0.756, 0.23, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 169.1 	 frac = [0.756, 0.229, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 100.1 	 frac = [0.757, 0.229, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 50.2 	 frac = [0.758, 0.228, 0.014]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 95.9 	 frac = [0.747, 0.237, 0.016]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 57.9 	 frac = [0.748, 0.236, 0.016]
q=Jet_Pt > 11.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 30.0 	 frac = [0.75, 0.235, 0.016]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=30: 	chi2/Ndf = 205.0 	 frac = [0.746, 0.241, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=50: 	chi2/Ndf = 123.6 	 frac = [0.747, 0.24, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 20.0, nbins=100: 	chi2/Ndf = 62.2 	 frac = [0.748, 0.239, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=30: 	chi2/Ndf = 290.5 	 frac = [0.751, 0.236, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=50: 	chi2/Ndf = 171.6 	 frac = [0.752, 0.235, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 22.0, nbins=100: 	chi2/Ndf = 85.1 	 frac = [0.753, 0.235, 0.013]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=30: 	chi2/Ndf = 163.6 	 frac = [0.739, 0.247, 0.014]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=50: 	chi2/Ndf = 98.1 	 frac = [0.74, 0.246, 0.014]
q=Jet_Pt > 9.0 and Jet_Pt < 18.0, nbins=100: 	chi2/Ndf = 49.7 	 frac = [0.741, 0.245, 0.014]

q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 11.5 	 frac = [0.789, 0.191, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 7.0 	 frac = [0.792, 0.188, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 4.5 	 frac = [0.796, 0.187, 0.018]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 18.5 	 frac = [0.79, 0.19, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 10.6 	 frac = [0.793, 0.188, 0.019]
q=Jet_Pt > 20.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 6.4 	 frac = [0.797, 0.186, 0.017]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 7.8 	 frac = [0.788, 0.192, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 5.2 	 frac = [0.791, 0.189, 0.02]
q=Jet_Pt > 20.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 3.6 	 frac = [0.795, 0.188, 0.017]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 5.8 	 frac = [0.795, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 3.6 	 frac = [0.798, 0.184, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 2.9 	 frac = [0.808, 0.177, 0.015]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 9.6 	 frac = [0.794, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 5.6 	 frac = [0.797, 0.185, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 4.1 	 frac = [0.808, 0.177, 0.015]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 4.7 	 frac = [0.795, 0.187, 0.018]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 2.9 	 frac = [0.798, 0.185, 0.017]
q=Jet_Pt > 22.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 2.3 	 frac = [0.809, 0.177, 0.013]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=30: 	chi2/Ndf = 22.4 	 frac = [0.786, 0.197, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=50: 	chi2/Ndf = 12.8 	 frac = [0.787, 0.195, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 30.0, nbins=100: 	chi2/Ndf = 7.2 	 frac = [0.79, 0.193, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=30: 	chi2/Ndf = 34.4 	 frac = [0.787, 0.194, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=50: 	chi2/Ndf = 19.1 	 frac = [0.789, 0.193, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 33.0, nbins=100: 	chi2/Ndf = 10.4 	 frac = [0.791, 0.191, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=30: 	chi2/Ndf = 15.8 	 frac = [0.784, 0.199, 0.017]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=50: 	chi2/Ndf = 9.4 	 frac = [0.786, 0.196, 0.018]
q=Jet_Pt > 18.0 and Jet_Pt < 27.0, nbins=100: 	chi2/Ndf = 5.6 	 frac = [0.789, 0.194, 0.017]

q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 3.6 	 frac = [0.823, 0.164, 0.013]
q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 2.5 	 frac = [0.828, 0.162, 0.01]
q=Jet_Pt > 30.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 1.7 	 frac = [0.845, 0.142, 0.013]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 34.4 	 frac = [0.847, 0.123, 0.03]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 20.4 	 frac = [0.852, 0.122, 0.026]
q=Jet_Pt > 30.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 10.9 	 frac = [0.876, 0.087, 0.037]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 2.3 	 frac = [0.825, 0.151, 0.024]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 1.6 	 frac = [0.835, 0.145, 0.02]
q=Jet_Pt > 30.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 1.5 	 frac = [0.845, 0.142, 0.013]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 3.3 	 frac = [0.868, 0.117, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.899, 0.087, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 1.7 	 frac = [0.903, 0.082, 0.016]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 16.3 	 frac = [0.839, 0.128, 0.033]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 9.8 	 frac = [0.847, 0.122, 0.032]
q=Jet_Pt > 33.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 5.5 	 frac = [0.871, 0.085, 0.044]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 2.2 	 frac = [0.928, 0.059, 0.014]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 1.8 	 frac = [0.915, 0.042, 0.043]
q=Jet_Pt > 33.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.938, 0.019, 0.043]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=30: 	chi2/Ndf = 5.1 	 frac = [0.812, 0.17, 0.019]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=50: 	chi2/Ndf = 3.4 	 frac = [0.817, 0.166, 0.017]
q=Jet_Pt > 27.0 and Jet_Pt < 40.0, nbins=100: 	chi2/Ndf = 2.6 	 frac = [0.835, 0.15, 0.015]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=30: 	chi2/Ndf = 71.4 	 frac = [0.881, 0.075, 0.045]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=50: 	chi2/Ndf = 41.9 	 frac = [0.886, 0.071, 0.043]
q=Jet_Pt > 27.0 and Jet_Pt < 44.0, nbins=100: 	chi2/Ndf = 22.6 	 frac = [0.913, 0.045, 0.041]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=30: 	chi2/Ndf = 3.7 	 frac = [0.811, 0.173, 0.016]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.817, 0.167, 0.015]
q=Jet_Pt > 27.0 and Jet_Pt < 36.0, nbins=100: 	chi2/Ndf = 2.3 	 frac = [0.848, 0.14, 0.012]

q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 4.2 	 frac = [0.854, 0.104, 0.042]
q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 2.6 	 frac = [0.853, 0.094, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.809, 0.138, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 4.5 	 frac = [0.852, 0.105, 0.043]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 2.8 	 frac = [0.852, 0.094, 0.054]
q=Jet_Pt > 40.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 1.4 	 frac = [0.808, 0.138, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 3.7 	 frac = [0.855, 0.104, 0.042]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 2.3 	 frac = [0.854, 0.093, 0.053]
q=Jet_Pt > 40.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 1.2 	 frac = [0.809, 0.138, 0.053]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 3.1 	 frac = [0.828, 0.133, 0.04]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 2.0 	 frac = [0.833, 0.13, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 1.3 	 frac = [0.812, 0.1, 0.088]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 3.3 	 frac = [0.826, 0.135, 0.04]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 2.2 	 frac = [0.832, 0.13, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 1.4 	 frac = [0.809, 0.102, 0.089]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 2.8 	 frac = [0.829, 0.131, 0.039]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 1.8 	 frac = [0.835, 0.127, 0.038]
q=Jet_Pt > 44.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 1.2 	 frac = [0.813, 0.099, 0.088]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=30: 	chi2/Ndf = 14.3 	 frac = [0.845, 0.095, 0.06]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=50: 	chi2/Ndf = 8.8 	 frac = [0.865, 0.074, 0.061]
q=Jet_Pt > 36.0 and Jet_Pt < 100.0, nbins=100: 	chi2/Ndf = 4.5 	 frac = [0.845, 0.099, 0.057]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=30: 	chi2/Ndf = 15.3 	 frac = [0.846, 0.092, 0.062]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=50: 	chi2/Ndf = 9.4 	 frac = [0.867, 0.07, 0.063]
q=Jet_Pt > 36.0 and Jet_Pt < 110.0, nbins=100: 	chi2/Ndf = 4.8 	 frac = [0.847, 0.095, 0.058]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=30: 	chi2/Ndf = 12.9 	 frac = [0.843, 0.099, 0.058]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=50: 	chi2/Ndf = 8.0 	 frac = [0.863, 0.077, 0.059]
q=Jet_Pt > 36.0 and Jet_Pt < 90.0, nbins=100: 	chi2/Ndf = 4.1 	 frac = [0.842, 0.104, 0.055]
In [211]:
x1,x2 = [],[]
x = []
xerr = []
y = []
yerr = []
for k,v in fit_res.items():
    print(k)
    x1.append(k[0])
    x2.append(k[1])
    x.append((k[0]+k[1])/2)
    xerr.append((k[1]-k[0])/2)
    r = v['c']
    y.append(np.median(r))
    yerr.append(np.std(r))
#     yerr.append(np.percentile(r,75) - np.percentile(r,25))
    
plt.figure(figsize=(6,5))
plt.errorbar(x,y,yerr=yerr,xerr=xerr, fmt='o');
plt.ylim(bottom=0)
plt.grid()
plt.xlabel('$p_T^{\,jet}$ [GeV/c]');
plt.ylabel('$c$-fraction')
plt.title('based on template fit to $b$-vs-$c$ score distr.', fontsize=12)
plt.tight_layout()
# plt.savefig('c-frac_template-fit_b-vs-c.png')
(10, 20)
(20, 30)
(30, 40)
(40, 100)
In [158]:
fit_res[(30,40)]['b']
Out[158]:
[0.010982703552412015,
 0.03031037100220878,
 0.010664275394406815,
 0.015066153466698592,
 0.03782409329717273,
 0.04248267628406001,
 0.012062322474706578,
 0.03398082278783701,
 0.01073674833724234]
In [213]:
plt.hist(fit_res[(30,40)]['c'], bins=10);