using DataFrames # Requires >= 0.22.0 for the rownumber function
using CSV
using Statistics
using LinearAlgebra
using Plots
using StatsBase
using Distributions
using MLJ
using MLJLinearModels
using MLJXGBoostInterface
using XGBoost
import MLJBase: train_test_pairs

# Used for the logistic regression + CV options, and also as an example of how to use scikit-learn from within Julia
using ScikitLearn

@sk_import linear_model: (LogisticRegression, LinearRegression)
@sk_import model_selection: (TimeSeriesSplit, KFold, GroupKFold, cross_val_score)
@sk_import metrics: make_scorer
PyObject <function make_scorer at 0x000000008F06BA60>
df = CSV.File("numerai_training_data.csv") |> DataFrame

first(df,5)

5 rows × 314 columns (omitted printing of 309 columns)

     id                era     data_type  feature_intelligence1  feature_intelligence2
     String            String  String     Float64                Float64
  1  n000315175b67977  era1    train      0.0                    0.5
  2  n0014af834a96cdd  era1    train      0.0                    0.0
  3  n001c93979ac41d4  era1    train      0.25                   0.5
  4  n0034e4143f22a13  era1    train      1.0                    0.0
  5  n00679d1a636062f  era1    train      0.25                   0.25
size(df)
(501808, 314)
features = select(df, r"feature") |> names
df.erano = parse.(Int64, replace.(df.era, "era" => ""))
eras = df.erano
target = "target"
length(features)
310
feature_groups =
    Dict(g => [c for c in features if startswith(c, "feature_$g")] 
        for g in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"])
Dict{String, Vector{String}} with 6 entries:
  "charisma"     => ["feature_charisma1", "feature_charisma2", "feature_charism…
  "constitution" => ["feature_constitution1", "feature_constitution2", "feature…
  "dexterity"    => ["feature_dexterity1", "feature_dexterity2", "feature_dexte…
  "wisdom"       => ["feature_wisdom1", "feature_wisdom2", "feature_wisdom3", "…
  "strength"     => ["feature_strength1", "feature_strength2", "feature_strengt…
  "intelligence" => ["feature_intelligence1", "feature_intelligence2", "feature…
# There's probably (definitely) a better way to write this; StatsBase's ordinalrank would handle the ranking step directly

function numerai_score(y_true, y_pred, df)
    # Rank predictions within each era (via sort + eachindex), normalize by era size;
    # the :rnum column restores the original row order before correlating with y_true
    rank_pred = sort(
        combine(groupby(DataFrame(y_pred = y_pred,
                                  eras = df.erano,
                                  rnum = rownumber.(eachrow(df))),
                        :eras),
                sdf -> sort(sdf, :y_pred),
                :eras => eachindex => :rank,
                nrow => :n),
        :rnum)

    rank_pred = rank_pred.rank ./ rank_pred.n

    cor(y_true, rank_pred)
end
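
As the comment above suggests, ordinalrank from StatsBase can replace the sort-and-index trick. Here is a sketch of an equivalent version (not part of the original notebook); it carries a row number so the original row order can be restored after grouping by era:

function numerai_score_ranked(y_true, y_pred, df)
    tmp = DataFrame(y_pred = y_pred, eras = df.erano, rnum = 1:length(y_pred))
    ranked = combine(groupby(tmp, :eras),
                     :rnum,
                     :y_pred => (x -> ordinalrank(x) ./ length(x)) => :rank_pred)
    sort!(ranked, :rnum)          # restore original row order
    cor(y_true, ranked.rank_pred)
end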

# While working, it can also be convenient to evaluate based on the regular (Pearson) correlation
# R2 Score to replicate the Python library outputs

function r2_score(y_true, y_pred)
    @assert length(y_true) == length(y_pred)
    ss_res = sum((y_true .- y_pred).^2)
    y_mean = sum(y_true) / length(y_true)   # named y_mean to avoid shadowing Statistics.mean
    ss_total = sum((y_true .- y_mean).^2)
    return 1 - ss_res/(ss_total + eps(eltype(y_pred)))
end

# cor() returns a matrix with no need for manipulation, so no need to replicate that here
r2_score (generic function with 1 method)
describe(df, :all, cols=:erano)

1 rows × 13 columns (omitted printing of 3 columns)

     variable  mean    std      min    q25   median  q75   max  nunique  nmissing
     Symbol    Float64  Float64  Int64  Float64  Float64  Float64  Int64  Nothing  Int64
  1  erano     64.002  33.3329  1      37.0  64.0    93.0  120             0
group_df = combine(groupby(df, :erano), nrow => :count)
plot(group_df.erano, group_df.count)
combine(groupby(df, :target), nrow => :count)

5 rows × 2 columns

     target   count
     Float64  Int64
  1  0.5      251677
  2  0.25     100053
  3  0.75     100045
  4  0.0       25016
  5  1.0       25017

Some of the features are very correlated

Especially within feature groups
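
As a quick check of the within-group claim, before looking at the full correlation matrix below, one could average the absolute pairwise correlations inside each group (a sketch, not part of the original notebook):

within_group_corr = Dict{String, Float64}()
for (g, cols) in feature_groups
    C = cor(Matrix(df[!, cols]))                              # correlation matrix for this group
    offdiag = [C[i, j] for j in 2:size(C, 2) for i in 1:j-1]  # strict upper-triangle entries
    within_group_corr[g] = mean(abs.(offdiag))
end
within_group_corr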

feature_corrs = DataFrame(cor(Matrix(df[!, names(df, features)])), features)
insertcols!(feature_corrs, 1, :features => features)

first(feature_corrs,5)

5 rows × 311 columns (omitted printing of 307 columns)

     features                feature_intelligence1  feature_intelligence2  feature_intelligence3
     String                  Float64                Float64                Float64
  1  feature_intelligence1    1.0                   -0.0141565             -0.0244041
  2  feature_intelligence2   -0.0141565              1.0                    0.905315
  3  feature_intelligence3   -0.0244041              0.905315               1.0
  4  feature_intelligence4    0.652596              -0.0280969             -0.0410859
  5  feature_intelligence5    0.0698683              0.184372               0.17387
first(stack(feature_corrs), 5)

5 rows × 3 columns

     features                variable                value
     String                  String                  Float64
  1  feature_intelligence1   feature_intelligence1    1.0
  2  feature_intelligence2   feature_intelligence1   -0.0141565
  3  feature_intelligence3   feature_intelligence1   -0.0244041
  4  feature_intelligence4   feature_intelligence1    0.652596
  5  feature_intelligence5   feature_intelligence1    0.0698683
tdf = stack(feature_corrs)
tdf = tdf[coalesce.(tdf.variable .< tdf.features, false), :]

sort!(tdf, :value)
vcat(first(tdf, 5), last(tdf, 5))

10 rows × 3 columns

      features                 variable                  value
      String                   String                    Float64
   1  feature_constitution9    feature_constitution112   -0.855008
   2  feature_constitution46   feature_constitution33    -0.83031
   3  feature_constitution60   feature_constitution112   -0.820694
   4  feature_constitution87   feature_constitution46    -0.815888
   5  feature_constitution33   feature_constitution112   -0.759084
   6  feature_constitution7    feature_constitution27     0.94892
   7  feature_constitution79   feature_constitution13     0.949139
   8  feature_wisdom39         feature_wisdom31           0.954984
   9  feature_wisdom7          feature_wisdom46           0.963706
  10  feature_wisdom2          feature_wisdom12           0.968062

The correlation can change over time

You can see this by comparing feature correlations on the first half and the second half of the training set

df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :]

corr₁ = DataFrame(cor(Matrix(df₁[!, names(df₁, features)])), features)
insertcols!(corr₁, 1, :features => features)
corr₁ = stack(corr₁)
corr₁ = corr₁[coalesce.(corr₁.variable .< corr₁.features, false), :]

corr₂ = DataFrame(cor(Matrix(df₂[!, names(df₂, features)])), features)
insertcols!(corr₂, 1, :features => features)
corr₂ = stack(corr₂)
corr₂ = corr₂[coalesce.(corr₂.variable .< corr₂.features, false), :]

tdf = leftjoin(corr₁, corr₂, on = [:variable, :features], makeunique=true)
rename!(tdf, [:value, :value_1] .=> [:corr₁, :corr₂])
tdf.corr_diff = tdf.corr₂ - tdf.corr₁
sort!(tdf, :corr_diff)

vcat(first(tdf,5), last(tdf,5))

10 rows × 5 columns

      features                 variable                 corr₁        corr₂        corr_diff
      String                   String                   Float64      Float64?     Float64
   1  feature_intelligence9    feature_intelligence11    0.0913519   -0.128851    -0.220203
   2  feature_intelligence10   feature_dexterity12       0.548931     0.343117    -0.205814
   3  feature_intelligence11   feature_dexterity9        0.0787148   -0.12707     -0.205785
   4  feature_dexterity12      feature_dexterity1        0.653528     0.447942    -0.205587
   5  feature_intelligence11   feature_intelligence10    0.0750222   -0.130511    -0.205534
   6  feature_wisdom22         feature_intelligence8    -0.0883461    0.117772     0.206119
   7  feature_wisdom43         feature_intelligence4    -0.102438     0.103758     0.206197
   8  feature_wisdom33         feature_intelligence4    -0.0789296    0.133664     0.212593
   9  feature_wisdom43         feature_intelligence8    -0.121306     0.115194     0.236501
  10  feature_wisdom33         feature_intelligence8    -0.0917593    0.150549     0.242308

Some features are predictive on their own

feature_scores = 
    Dict(feature => numerai_score(df.target, df[!, feature], df) 
    for feature in features);
sort(collect(feature_scores), by=x->x[2])
310-element Vector{Pair{String, Float64}}:
      "feature_dexterity7" => -0.011504914975281387
      "feature_dexterity6" => -0.011161569760516648
      "feature_dexterity4" => -0.011051275935746707
      "feature_charisma69" => -0.010221311263804505
     "feature_dexterity11" => -0.010198611285978189
       "feature_charisma9" => -0.01005025481510148
     "feature_dexterity12" => -0.008647990681418269
     "feature_dexterity14" => -0.008611934226827449
      "feature_dexterity3" => -0.007607475423078082
  "feature_constitution91" => -0.007343474478571397
      "feature_dexterity8" => -0.0072798010318430835
  "feature_constitution56" => -0.007206474137472462
 "feature_constitution110" => -0.007154545491821277
                           ⋮
       "feature_strength4" => 0.010184232051437234
      "feature_charisma81" => 0.010224703123621929
      "feature_charisma46" => 0.01039946229179084
      "feature_charisma66" => 0.010507207995266913
      "feature_charisma54" => 0.010507614931830764
       "feature_charisma6" => 0.010580151207446348
      "feature_charisma76" => 0.010608014161745269
      "feature_charisma18" => 0.010697616377184683
      "feature_charisma19" => 0.01071636820945142
      "feature_charisma37" => 0.01089177370157534
      "feature_strength14" => 0.01160884774563975
      "feature_strength34" => 0.012487781133717898
by_era_correlation = 
    sort(Dict(values(erano)[1] => cor(tdf.target, tdf.feature_strength34)
         for (erano, tdf) in pairs(groupby(df, :erano))))
    
plot(by_era_correlation)
function rolling_mean(arr, n)
    rs = cumsum(arr)[n:end] .- cumsum([0.0; arr])[1:end-n]
    return rs ./ n
end

n_window = 10

plot(Dict(zip(collect(n_window:length(by_era_correlation)),   # x = era index at the end of each window
        rolling_mean(collect(values(by_era_correlation)), n_window))))

Gotcha: MSE looks worse than correlation out of sample

Models will generally be overconfident, so even if they are good at ranking rows, the mean squared error of the residuals could be larger than even the mean squared error of the target (r-squared < 0)
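
A toy illustration (not from the original notebook) using the r2_score function defined above: predictions with the right ordering but the wrong scale and level correlate perfectly with the target, yet score a negative r-squared.

y_true_toy = [0.0, 0.25, 0.5, 0.75, 1.0]
y_pred_toy = [0.7, 0.75, 0.8, 0.85, 0.9]   # correct ranking, but compressed and shifted
cor(y_true_toy, y_pred_toy)                # 1.0, a perfect linear correlation
r2_score(y_true_toy, y_pred_toy)           # ≈ -0.36, residual SS exceeds the target's total SS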

df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
Linear = @load LinearRegressor pkg=MLJLinearModels verbosity=0
linear = Linear()

lin₁ = machine(linear, df₁[!, names(df₁, features)], df₁.target)
fit!(lin₁, verbosity=0)

lin₂ = machine(linear, df₂[!, names(df₂, features)], df₂.target)
fit!(lin₂, verbosity=0);
r2₁ = [
    r2_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]))
        for dfₓ in [df₁, df₂]
    for model in [lin₁, lin₂]]

DataFrame(reshape(r2₁, 2, 2), ["eval_on_1","eval_on_2"])

2 rows × 2 columns

     eval_on_1     eval_on_2
     Float64       Float64
  1  0.00409275    -0.000543157
  2  0.000574622    0.00315522
corrs = [
    numerai_score(MLJ.predict(model, dfₓ[!, names(dfₓ, features)]), dfₓ.target, dfₓ)
        for dfₓ in [df₁, df₂]
    for model in [lin₁, lin₂]]

DataFrame(reshape(corrs, 2, 2), ["eval_on_1","eval_on_2"])

2 rows × 2 columns

     eval_on_1   eval_on_2
     Float64     Float64
  1  0.058282    0.0287217
  2  0.0319412   0.0528229

XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
xgb = XGB()

xgb₁ = machine(xgb, df₁[!, names(df₁, features)], df₁.target)
fit!(xgb₁, verbosity=0)

xgb₂ = machine(xgb, df₂[!, names(df₂, features)], df₂.target)
fit!(xgb₂, verbosity=0);
r2₂ = [
    r2_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]))
        for dfₓ in [df₁, df₂]
    for model in [xgb₁, xgb₂]]

DataFrame(reshape(r2₂, 2, 2), ["eval_on_1","eval_on_2"])

2 rows × 2 columns

     eval_on_1    eval_on_2
     Float64      Float64
  1   0.123117    -0.0237936
  2  -0.0199959    0.12788
corrs2 = [
    numerai_score(MLJ.predict(model, dfₓ[!, names(dfₓ, features)]), dfₓ.target, dfₓ)
        for dfₓ in [df₁, df₂]
    for model in [xgb₁, xgb₂]]

DataFrame(reshape(corrs2, 2, 2), ["eval_on_1","eval_on_2"])

2 rows × 2 columns

     eval_on_1   eval_on_2
     Float64     Float64
  1  0.383874    0.0240443
  2  0.0229365   0.392287

Gotcha: {0, 1} are noticeably different from {0.25, 0.75}

This makes training a classifier one-versus-rest behave counterintuitively.

Specifically, the 0-vs-rest and 1-vs-rest classifiers seem to learn how to pick out extreme targets, and their predictions are the most correlated

logistic = LogisticRegression()
ScikitLearn.fit!(logistic, Matrix(df[!, names(df, features)]), convert.(Int, df.target*4))
ScikitLearn.score(logistic, Matrix(df[!, names(df, features)]), convert.(Int, df.target*4))
0.5012315467270351
log_corrs = cor(transpose(ScikitLearn.predict_proba(logistic, Matrix(df[!, names(df, features)]))), dims=2)
display(log_corrs)

heatmap(log_corrs,  c=palette(:RdYlGn))
5×5 Matrix{Float64}:
  1.0        0.468155  -0.903881   0.42197    0.947252
  0.468155   1.0       -0.704718   0.517207   0.428423
 -0.903881  -0.704718   1.0       -0.71843   -0.914418
  0.42197    0.517207  -0.71843    1.0        0.498854
  0.947252   0.428423  -0.914418   0.498854   1.0
prob_matrix = ScikitLearn.predict_proba(logistic, Matrix(df[!, names(df, features)]))
classes = logistic.classes_
numerai_score(df.target, prob_matrix * classes, df)
0.050658929537343786
linear = LinearRegression()
ScikitLearn.fit!(linear, Matrix(df[!, names(df, features)]), df.target)
ScikitLearn.score(linear, Matrix(df[!, names(df, features)]), df.target)
preds = ScikitLearn.predict(linear, Matrix(df[!, names(df, features)]))
numerai_score(df.target, preds, df)
0.05107803901831943

Gotcha: eras are homogeneous, but different from each other

Random cross-validation will look much better than cross-validating by era

Even for a simple linear model, a shuffled K-fold split reports a correlation of about 4.0%, while a time-series split reports a lower score of about 3.1%

#ScikitLearn.fit!(linear, Matrix(df[!, names(df, features)]), df.target)
PyObject LinearRegression()
crossvalidators = [KFold(5), KFold(5, shuffle = true), GroupKFold(5), TimeSeriesSplit(5)]

for cv in crossvalidators
    println(cv)
    println(
        mean(
            cross_val_score(estimator = LinearRegression(),
                X = Matrix(df[!, names(df, features)]),
                y = df.target,
                cv = cv,
                groups = eras,
                scoring = make_scorer(cor, greater_is_better = true)
            )
        )
    )
end
PyObject KFold(n_splits=5, random_state=None, shuffle=False)
0.03332624500455265
PyObject KFold(n_splits=5, random_state=None, shuffle=True)
0.03955268244942284
PyObject GroupKFold(n_splits=5)
0.03475937229926111
PyObject TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
0.030947709608331396
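
The same era-grouped idea can be expressed natively in MLJ with a custom resampling strategy, which is what the import MLJBase: train_test_pairs line at the top of the notebook enables. Below is a sketch (not part of the original run; EraKFold is a hypothetical name) that assigns whole eras to folds so no era is split between train and test:

import MLJBase

struct EraKFold <: MLJBase.ResamplingStrategy
    nfolds::Int
    eras::Vector{Int}   # era label for every row of the data, e.g. df.erano
end

function train_test_pairs(cv::EraKFold, rows)
    era_of_row = cv.eras[rows]
    unique_eras = sort(unique(era_of_row))
    blocksize = ceil(Int, length(unique_eras) / cv.nfolds)
    folds = Tuple{Vector{Int}, Vector{Int}}[]
    for block in Iterators.partition(unique_eras, blocksize)
        test = rows[in.(era_of_row, Ref(Set(block)))]   # rows whose era falls in this block
        push!(folds, (setdiff(rows, test), test))
    end
    return folds
end

# usage sketch (expensive on the full data set):
# evaluate(Linear(), df[!, features], df.target,
#          resampling = EraKFold(5, df.erano), measure = rsq, verbosity = 0)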

Eras can be more or less applicable to other eras

You can test this by splitting the eras into blocks of 10, training on each block, and evaluating on each other block.

eras10 = (eras .÷ 10) .* 10   # bucket eras into blocks of 10
countmap(eras10)
Dict{Int64, Int64} with 13 entries:
  20  => 37444
  110 => 45070
  60  => 46831
  30  => 41101
  0   => 24515
  80  => 43971
  90  => 45609
  40  => 43439
  70  => 40403
  50  => 48186
  10  => 34600
  120 => 4532
  100 => 46107
gdf = copy(df)
gdf[:, :eras10] = eras10
gdf = groupby(filter(row -> row[:eras10] < 120, gdf), :eras10);
results10 = DataFrame(train_era = Int32[], test_era = Int32[], value = Float32[])

for train_era in keys(gdf)
    
    println(train_era[1])
    
    gdf₁ = gdf[train_era]
    model = LinearRegression()
    ScikitLearn.fit!(model, Matrix(gdf₁[!, names(gdf₁, features)]), gdf₁.target)
    
    for test_era in keys(gdf)
        
        gdf₂ = gdf[test_era]
        
        push!(results10, [train_era[1], 
                         test_era[1], 
                         cor(gdf₂.target, ScikitLearn.predict(model, Matrix(gdf₂[!, names(gdf₂, features)])))])
    end
end
0
10
20
30
40
50
60
70
80
90
100
110
results_df = unstack(results10, :test_era, :value)

12 rows × 13 columns (omitted printing of 6 columns)

      train_era  0           10          20          30          40          50
      Int32      Float32?    Float32?    Float32?    Float32?    Float32?    Float32?
   1    0        0.14615     0.0321283   0.0354025   0.0287675   0.0221984   0.00701235
   2   10        0.0421759   0.114813    0.0287059   0.0298504   0.0336937   0.00471899
   3   20        0.0431496   0.0334976   0.113055    0.0366226   0.0167489   0.00565709
   4   30        0.0357169   0.0339307   0.0396031   0.109884    0.0402888   0.0208269
   5   40        0.035735    0.0417183   0.0204626   0.0403498   0.100257    0.0144214
   6   50        0.015032    0.00959667  0.00685722  0.0242685   0.0151326   0.104185
   7   60        0.00690366  0.0159851   0.00419466  0.0195585   0.012405    0.00967648
   8   70        0.034285    0.0252239   0.0220385   0.0285308   0.0232155   0.00198308
   9   80        0.0395826   0.0268682   0.0115186   0.0217091   0.0177472   0.00252007
  10   90        0.0328201   0.029052    0.0229233   0.031348    0.0199844   0.0100413
  11  100        0.0283381   0.0179835   0.0217198   0.00991935  0.00779132  0.0120947
  12  110        0.00181083  0.0183579   0.00941574  0.0067019   0.0147348   0.0164994
heatmap(clamp!(Matrix(select(results_df, Not(:train_era))), -.04, .04),  c=palette(:RdYlGn))

Here is an advanced paper that talks about generalization. Eras can be thought about in the same way that "distributions" or "environments" are talked about here https://arxiv.org/pdf/1907.02893.pdf

Gotcha: Since the signal-to-noise ratio is so low, models can take many more iterations than expected, and have scarily high in-sample performance

df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
function our_score(preds, dtrain)
    return "score", cor(get_info(dtrain, "label"), preds)
end

dtrain = DMatrix(Matrix(df₁[!, features]), label=df₁.target)
dtest = DMatrix(Matrix(df₂[!, features]), label=df₂.target)
dall = DMatrix(Matrix(df[!, features]), label=df.target);
# The source code shows that the evaluation log only prints to stderr - one could redirect it to an IOBuffer and
# regex-parse it into an array, but realistically the effort isn't worth it, since the out-of-sample performance
# differences are clear from the numbers printed

param = Dict(
    "eta" => 0.1,
    "max_depth" => 3,
    "objective" => "reg:squarederror",
    "eval_metric" => "rmse"
)


xgboost(dtrain,
    100,
    param = param, 
    watchlist = [(dtrain, "train"), (dtest, "test")], 
    feval = our_score
)
Booster(Ptr{Nothing} @0x00000000c46a4790)

The results are sensitive to the choice of parameters, which should be picked through cross-validation

df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
Linear = @load LinearRegressor pkg=MLJLinearModels verbosity=0
Elastic = @load ElasticNetRegressor pkg=MLJLinearModels verbosity=0
ElasticNetRegressor
models = vcat(
    [Linear()],
    [Elastic(lambda = λ) for λ in [0.01, 0.005, 0.002, 0.001, 0.0005, 0.0002, 0.0001, 0.00005, 0.00002, 0.00001]],
    [XGB()],
    [XGB(eta = 0.01, num_round=1000)],
    [XGB(eta = 0.01, colsample_bytree=0.1, num_round=1000)],
    [XGB(eta = 0.01, colsample_bytree=0.1, num_round=1000, max_depth=5)],
    [XGB(eta = 0.001, colsample_bytree=0.1, num_round=1000, max_depth=5)]
);
  0.056996 seconds (110.33 k allocations: 7.421 MiB, 102.88% compilation time)
for model in models
    print(" -- ", model, "\n")
    mach = machine(model, df₁[!, features], df₁.target)
    MLJ.fit!(mach, verbosity=0)
    outsample = numerai_score(df₂.target, MLJ.predict(mach, df₂[!, features]), df₂)
    insample = numerai_score(df₁.target, MLJ.predict(mach, df₁[!, features]), df₁)
    print("outsample: $outsample, insample: $insample", "\n")
end
 -- LinearRegressor @226
outsample: 0.028025599207339144, insample: 0.06275168899240204
 -- ElasticNetRegressor @411
outsample: 0.027651106932304964, insample: 0.061801048380003165
 -- ElasticNetRegressor @697
outsample: 0.027651115862462477, insample: 0.061800987594603896
 -- ElasticNetRegressor @825
outsample: 0.02765108322778513, insample: 0.06180104261606859
 -- ElasticNetRegressor @024
outsample: 0.027651162275441423, insample: 0.061801053881855784
 -- ElasticNetRegressor @510
outsample: 0.027651155706182137, insample: 0.06180105710588575
 -- ElasticNetRegressor @526
outsample: 0.027651162860262833, insample: 0.06180104657865728
 -- ElasticNetRegressor @754
outsample: 0.0276511455748371, insample: 0.061801044145696406
 -- ElasticNetRegressor @689
outsample: 0.02765113464601516, insample: 0.061801044145696406
 -- ElasticNetRegressor @244
outsample: 0.027651141667503418, insample: 0.06180103987617271
 -- ElasticNetRegressor @526
outsample: 0.027651141667503418, insample: 0.06180103987617271
 -- XGBoostRegressor @609
outsample: 0.024815763345799297, insample: 0.4033048140864821
 -- XGBoostRegressor @496
outsample: 0.03659074980295988, insample: 0.34474251174858644
 -- XGBoostRegressor @986
outsample: 0.03897987810857149, insample: 0.3118486054318112
 -- XGBoostRegressor @039
outsample: 0.039683297779925394, insample: 0.20312468355680283
 -- XGBoostRegressor @194
outsample: 0.034509192497652864, insample: 0.11544644084833998
1176.550459 seconds (57.51 M allocations: 143.696 GiB, 1.12% gc time, 0.19% compilation time)
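
For a more systematic search than the manual list above, the hyperparameters could be picked with MLJ's TunedModel wrapper. A sketch (not part of the original run; the ranges and grid resolution are illustrative only):

xgb_base = XGB(num_round = 1000, colsample_bytree = 0.1)
r_eta    = range(xgb_base, :eta, lower = 0.001, upper = 0.1, scale = :log)
r_depth  = range(xgb_base, :max_depth, lower = 3, upper = 6)

tuned_xgb = TunedModel(model = xgb_base,
                       tuning = Grid(resolution = 4),
                       resampling = CV(nfolds = 3),
                       range = [r_eta, r_depth],
                       measure = rms)

mach_tuned = machine(tuned_xgb, df₁[!, features], df₁.target)
# MLJ.fit!(mach_tuned, verbosity = 0)   # expensive on the full training half
# report(mach_tuned).best_model         # inspect the selected hyperparameters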

Gotcha: Models with large exposures to individual features tend to perform poorly or inconsistently out of sample

XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
xgb = XGB(eta = 0.01, max_depth=5, num_round=1000);
mach = machine(xgb, df₁[!, features], df₁.target)
MLJ.fit!(mach, verbosity=0)

xgb_preds = MLJ.predict(mach, df₂[!, features]);
xgb_preds
248653-element Vector{Float32}:
 0.5092171
 0.51353323
 0.5298325
 0.50988734
 0.50764424
 0.50148475
 0.504006
 0.49748185
 0.49696985
 0.48918203
 0.50696886
 0.51324135
 0.48414978
 ⋮
 0.48918062
 0.47421244
 0.5090075
 0.48367783
 0.47870287
 0.5039986
 0.4987926
 0.49181792
 0.51567954
 0.5039868
 0.48160774
 0.48305735

Our predictions have correlation > 0.2 in either direction for some single features!

Sure hope those features continue to act as they have in the past!

cor_list = []
for feature in features
    append!(cor_list, cor(df₂[!, feature], xgb_preds))
end
    
describe(DataFrame(cor_list = cor_list), :all)

1 rows × 13 columns (omitted printing of 5 columns)

     variable  mean       std        min        q25         median     q75       max
     Symbol    Float64    Float64    Float64    Float64     Float64    Float64   Float64
  1  cor_list  0.0484105  0.0816448  -0.229774  0.00417808  0.0457962  0.108646  0.232515
# Feature neutralization: gaussianize the prediction ranks, then subtract the part of the
# scores explained (via least squares) by the feature exposures, scaled by `proportion`
function norm_neut(df, columns, feats, proportion=1.0)
    scores = quantile.(Normal(0.0, 1.0), (ordinalrank(df[!, columns]) .- 0.5) ./ length(df[!, columns]))
    exposures = Matrix(df[!, feats])
    neutralized = scores - proportion * exposures * (pinv(exposures) * scores)
    return neutralized / std(neutralized)
end;
df₂.preds = xgb_preds

df₂[:, :preds_neutralized] = combine(x -> norm_neut(x, :preds, features, 0.5), groupby(df₂, :erano)).x1

# Min-max rescale the neutralized predictions back to the [0, 1] range
x_min = minimum(df₂.preds_neutralized)
x_max = maximum(df₂.preds_neutralized)
X_std = (df₂.preds_neutralized .- x_min) / (x_max - x_min)
df₂[!, :preds_neutralized] = X_scaled = X_std * (1 - 0) .+ 0;
describe(df₂.preds_neutralized)
Summary Stats:
Length:         248653
Missing Count:  0
Mean:           0.512301
Minimum:        0.000000
1st Quartile:   0.445243
Median:         0.510633
3rd Quartile:   0.577324
Maximum:        1.000000
Type:           Float32

Now our single feature exposures are much smaller

cor_list2 = []
for feature in features
    append!(cor_list2, cor(df₂[!, feature], df₂.preds_neutralized))
end
    
describe(DataFrame(cor_list2 = cor_list2), :all)

1 rows × 13 columns (omitted printing of 5 columns)

     variable   mean       std      min        q25         median     q75        max
     Symbol     Float64    Float64  Float64    Float64     Float64    Float64    Float64
  1  cor_list2  0.0361128  0.05318  -0.146887  0.00775736  0.0337416  0.0753399  0.155376

Our overall score goes down, but the scores are more consistent than before. This leads to a higher Sharpe ratio.

unbalanced_scores_per_era = combine(x -> cor(x.preds, x.target), groupby(df₂, :era))
balanced_scores_per_era = combine(x -> cor(x.preds_neutralized, x.target), groupby(df₂, :era));
println("score for high feature exposure: ", mean(unbalanced_scores_per_era.x1))
println("score for balanced feature expo: ", mean(balanced_scores_per_era.x1))

println("std for high feature exposure: ", std(unbalanced_scores_per_era.x1))
println("std for balanced feature expo: ", std(balanced_scores_per_era.x1))

println("sharpe for high feature exposure: ", mean(unbalanced_scores_per_era.x1)/std(unbalanced_scores_per_era.x1))
println("sharpe for balanced feature expo: ", mean(balanced_scores_per_era.x1)/std(balanced_scores_per_era.x1))
score for high feature exposure: 0.0368068530006
score for balanced feature expo: 0.03288299544568849
std for high feature exposure: 0.03848940885417861
std for balanced feature expo: 0.03158550581657646
sharpe for high feature exposure: 0.9562852248535912
sharpe for balanced feature expo: 1.0410786402043652
describe(balanced_scores_per_era.x1)
Summary Stats:
Length:         56
Missing Count:  0
Mean:           0.032883
Minimum:        -0.065154
1st Quartile:   0.013038
Median:         0.030797
3rd Quartile:   0.061884
Maximum:        0.091098
Type:           Float64
describe(unbalanced_scores_per_era.x1)
Summary Stats:
Length:         56
Missing Count:  0
Mean:           0.036807
Minimum:        -0.085174
1st Quartile:   0.014656
Median:         0.033550
3rd Quartile:   0.062175
Maximum:        0.112687
Type:           Float64