import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uproot
from functools import partial
froot = uproot.open('PWGHF/HFCJ_pp_MC/660_20190221-1807/AnalysisResults_10files.root')
froot.keys()
tree_name_core = 'JetTree_AliAnalysisTaskJetExtractor_JetPY_AKTChargedR040_tracks_pT0150_E_scheme_'
df_b = froot[tree_name_core+'bJets'].pandas.df(flatten=False)
df_c = froot[tree_name_core+'cJets'].pandas.df(flatten=False)
df_l = froot[tree_name_core+'udsgJets'].pandas.df(flatten=False)
print(f'Number of jets in sample:\nb\t{len(df_b)}\nc\t{len(df_c)}\nudsg\t{len(df_l)}')
df_l.columns
n_bins = 25
color_b = 'r'
color_c = 'orange'
color_l = 'b'
density = True
def plot_histo(vals_b, vals_c, vals_l, title, n_bins=n_bins):
vals_all = np.hstack([vals_b, vals_c, vals_l])
if np.std(vals_all) < 1e-6:
print('-- skipping due to 0 variance')
return
bins = np.linspace(min(vals_all), max(vals_all), n_bins)
fig,axes = plt.subplots(ncols=2, figsize=(14,5))
for vals, color, label in [(vals_l, color_l, 'udsg'), (vals_c, color_c, 'c'), (vals_b, color_b, 'b')]:
axes[0].hist(vals, bins=bins, histtype='step', color=color, label=label, density=density, lw=2)
axes[0].legend()
axes[1].hist(vals, bins=bins, histtype='step', color=color, label=label, density=density, lw=2)
axes[1].legend()
axes[1].set_yscale('log')
fig.suptitle(title, fontsize=14)
return axes
for col in df_l.columns:
if isinstance(df_l[col][0], np.ndarray): continue
print(col)
vals_b = df_b[col]
vals_c = df_c[col]
vals_l = df_l[col]
plot_histo(vals_b, vals_c, vals_l, col)
plt.show()
for col in df_l.columns:
if not isinstance(df_l[col][0], np.ndarray): continue
print(col)
vals_b = np.hstack(df_b[col])
vals_c = np.hstack(df_c[col])
vals_l = np.hstack(df_l[col])
plot_histo(vals_b, vals_c, vals_l, col)
plt.show()
def nth_max_val(row, col, n, absv=False, verbose=False):
'''
for columns which elements are arrays
returns n-th max value of given column for each row
'''
if absv:
dummy = 0
if verbose:
print(f'col={col}, n={n}, absv={absv}')
l = sorted(list(abs(row[col]))+n*[dummy])
print(l, '-->', l[-n])
return sorted(list(abs(row[col]))+n*[dummy])[-n]
else:
dummy = -9
if verbose:
print(f'col={col}, n={n}, absv={absv}')
l = sorted(list((row[col]))+n*[dummy])
print(l, '-->', l[-n])
return sorted(list( row[col]) +n*[dummy])[-n]
max_SV_pull = ('max_SV_pull', lambda row: max([Lxy/sigmaLxy for Lxy,sigmaLxy in zip(row['Jet_SecVtx_Lxy'],row['Jet_SecVtx_SigmaLxy'])]+[0]))
max_IPd = ('max_IPd', lambda row: max(abs(row['Jet_Track_IPd'])+[0]))
max_IPz = ('max_IPz', lambda row: max(abs(row['Jet_Track_IPz'])+[0]))
leading_track_pt_frac = ('leading_track_pt_frac', lambda row: nth_max_val(row, 'Jet_Track_Pt', 1, absv=True) / row['Jet_Pt'])
leading_2track_pt_frac = ('leading_2track_pt_frac', lambda row: (nth_max_val(row, 'Jet_Track_Pt', 1, absv=True)+nth_max_val(row, 'Jet_Track_Pt', 2, absv=True)) / row['Jet_Pt'])
features = [max_SV_pull, leading_track_pt_frac, leading_2track_pt_frac, max_IPd, max_IPz]
# a'la Track Counting algorith:
for absv in [True, False]:
for col in ['Jet_Track_IPd', 'Jet_Track_IPz']:
for n in [1,2,3]:
absv_str = 'abs' if absv else 'raw'
feat_name = f'max_{n}th_{col[-3:]}_{absv_str}'
feat_fun = partial(nth_max_val, col=col, n=n, absv=absv)
### default values in lambda required due to `closures`, binding to local
### variables which are overwritten before exec. of a func, see (and links therein):
### https://stackoverflow.com/questions/32595586/in-python-why-do-lambdas-in-list-comprehensions-overwrite-themselves-in-retrosp
# feat_fun = lambda row,col=col,n=n,absv=absv: nth_max_val(row, col, n, absv=absv, verbose=False)
features.append((feat_name, feat_fun))
for name, feat_fun in features:
print(name)
vals_b = df_b.apply(feat_fun, axis=1)
vals_c = df_c.apply(feat_fun, axis=1)
vals_l = df_l.apply(feat_fun, axis=1)
plot_histo(vals_b, vals_c, vals_l, name)
plt.show()