Numerai Analysis & Tips in Julia!
This post is a (near) 1:1 replication of the Python "analysis & tips" Jupyter notebook for the Numerai data science competition, but written in Julia.
using DataFrames # Requires >= 0.22.0 for the rownumber function
using CSV
using Statistics
using LinearAlgebra
using Plots
using StatsBase
using Distributions
using MLJ
using MLJLinearModels
using MLJXGBoostInterface
using XGBoost
import MLJBase: train_test_pairs
# Used for logistic regression + CV options, and also as an example of how to use scikit-learn from within Julia
using ScikitLearn
@sk_import linear_model: (LogisticRegression, LinearRegression)
@sk_import model_selection: (TimeSeriesSplit, KFold, GroupKFold, cross_val_score)
@sk_import metrics: make_scorer
df = CSV.File("numerai_training_data.csv") |> DataFrame
first(df,5)
size(df)
features = select(df, r"feature") |> names
df.erano = parse.(Int64, replace.(df.era, "era" => ""))
eras = df.erano
target = "target"
length(features)
feature_groups =
Dict(g => [c for c in features if startswith(c, "feature_$g")]
for g in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"])
# There's probably (definitely) a better way to write this - ordinalrank would handle the ranking directly (see the sketch after this function)
function numerai_score(y_true, y_pred, df)
rank_pred = sort(combine(groupby(DataFrame(y_pred = y_pred
, eras = df.erano
, rnum = rownumber.(eachrow(df)))
, :eras)
, sdf -> sort(sdf, :y_pred)
, :eras => eachindex => :rank
, nrow => :n)
, :rnum)
rank_pred = rank_pred.rank ./ rank_pred.n
cor(y_true, rank_pred)
end
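As noted in the comment above, ordinalrank from StatsBase gives a more direct way to do the per-era ranking. Below is a minimal sketch under that assumption (numerai_score_v2 is just an illustrative name, not something from the original notebook); it relies on transform over a grouped DataFrame preserving the parent's row order.
# Sketch: the same metric, ranking within each era via StatsBase.ordinalrank
function numerai_score_v2(y_true, y_pred, df)
    tdf = DataFrame(y_pred = y_pred, erano = df.erano)
    # transform on a GroupedDataFrame keeps the parent row order, so rank_pred stays aligned with y_true
    ranked = transform(groupby(tdf, :erano),
                       :y_pred => (p -> ordinalrank(p) ./ length(p)) => :rank_pred)
    cor(y_true, ranked.rank_pred)
end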
# It can also be convenient while working to evaluate based on the regular (pearson) correlation
# R2 Score to replicate the Python library outputs
function r2_score(y_true, y_pred)
    @assert length(y_true) == length(y_pred)
    ss_res = sum((y_true .- y_pred).^2)
    ȳ = sum(y_true) / length(y_true)   # avoid shadowing Statistics.mean
    ss_total = sum((y_true .- ȳ).^2)
    return 1 - ss_res/(ss_total + eps(eltype(y_pred)))
end
# cor() returns a matrix with no need for manipulation, so no need to replicate that here
describe(df, :all, cols=:erano)
group_df = combine(groupby(df, :erano), nrow => :count)
plot(group_df.erano, group_df.count)
combine(groupby(df, :target), nrow => :count)
feature_corrs = DataFrame(cor(Matrix(df[!, names(df, features)])), features)
insertcols!(feature_corrs, 1, :features => features)
first(feature_corrs,5)
first(stack(feature_corrs), 5)
tdf = stack(feature_corrs)
tdf = tdf[coalesce.(tdf.variable .< tdf.features, false), :]
sort!(tdf, :value)
vcat(first(tdf, 5), last(tdf, 5))
df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :]
corr₁ = DataFrame(cor(Matrix(df₁[!, names(df₁, features)])), features)
insertcols!(corr₁, 1, :features => features)
corr₁ = stack(corr₁)
corr₁ = corr₁[coalesce.(corr₁.variable .< corr₁.features, false), :]
corr₂ = DataFrame(cor(Matrix(df₂[!, names(df₂, features)])), features)
insertcols!(corr₂, 1, :features => features)
corr₂ = stack(corr₂)
corr₂ = corr₂[coalesce.(corr₂.variable .< corr₂.features, false), :]
tdf = leftjoin(corr₁, corr₂, on = [:variable, :features], makeunique=true)
rename!(tdf, [:value, :value_1] .=> [:corr₁, :corr₂])
tdf.corr_diff = tdf.corr₂ - tdf.corr₁
sort!(tdf, :corr_diff)
vcat(first(tdf,5), last(tdf,5))
feature_scores =
Dict(feature => numerai_score(df.target, df[!, feature], df)
for feature in features);
sort(collect(feature_scores), by=x->x[2])
by_era_correlation =
sort(Dict(values(erano)[1] => cor(tdf.target, tdf.feature_strength34)
for (erano, tdf) in pairs(groupby(df, :erano))))
plot(by_era_correlation)
# Rolling mean over a window of length n, computed as the difference of two cumulative sums
function rolling_mean(arr, n)
    rs = cumsum(arr)[n:end] .- cumsum([0.0; arr])[1:end-n]
    return rs ./ n
end
n_window = 10
plot(Dict(zip(collect(n_window:length(by_era_correlation)),
              rolling_mean(collect(values(by_era_correlation)), n_window))))
df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
Linear = @load LinearRegressor pkg=MLJLinearModels verbosity=0
linear = Linear()
lin₁ = machine(linear, df₁[!, names(df₁, features)], df₁.target)
fit!(lin₁, verbosity=0)
lin₂ = machine(linear, df₂[!, names(df₂, features)], df₂.target)
fit!(lin₂, verbosity=0);
r2₁ = [
r2_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]))
for dfₓ in [df₁, df₂]
for model in [lin₁, lin₂]]
DataFrame(reshape(r2₁, 2, 2), ["eval_on_1","eval_on_2"])
corrs = [
numerai_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]), dfₓ)
for dfₓ in [df₁, df₂]
for model in [lin₁, lin₂]]
DataFrame(reshape(corrs, 2, 2), ["eval_on_1","eval_on_2"])
XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
xgb = XGB()
xgb₁ = machine(xgb, df₁[!, names(df₁, features)], df₁.target)
fit!(xgb₁, verbosity=0)
xgb₂ = machine(xgb, df₂[!, names(df₂, features)], df₂.target)
fit!(xgb₂, verbosity=0);
r2₂ = [
r2_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]))
for dfₓ in [df₁, df₂]
for model in [xgb₁, xgb₂]]
DataFrame(reshape(r2₂, 2, 2), ["eval_on_1","eval_on_2"])
corrs2 = [
numerai_score(dfₓ.target, MLJ.predict(model, dfₓ[!, names(dfₓ, features)]), dfₓ)
for dfₓ in [df₁, df₂]
for model in [xgb₁, xgb₂]]
DataFrame(reshape(corrs2, 2, 2), ["eval_on_1","eval_on_2"])
logistic = LogisticRegression()
ScikitLearn.fit!(logistic, Matrix(df[!, names(df, features)]), convert.(Int, df.target*4))
ScikitLearn.score(logistic, Matrix(df[!, names(df, features)]), convert.(Int, df.target*4))
log_corrs = cor(transpose(ScikitLearn.predict_proba(logistic, Matrix(df[!, names(df, features)]))), dims=2)
display(log_corrs)
heatmap(log_corrs, c=palette(:RdYlGn))
prob_matrix = ScikitLearn.predict_proba(logistic, Matrix(df[!, names(df, features)]))
classes = logistic.classes_
numerai_score(df.target, prob_matrix * classes, df)
linear = LinearRegression()
ScikitLearn.fit!(linear, Matrix(df[!, names(df, features)]), df.target)
ScikitLearn.score(linear, Matrix(df[!, names(df, features)]), df.target)
preds = ScikitLearn.predict(linear, Matrix(df[!, names(df, features)]))
numerai_score(df.target, preds, df)
crossvalidators = [KFold(5), KFold(5, shuffle = true), GroupKFold(5), TimeSeriesSplit(5)]
for cv in crossvalidators
println(cv)
println(
mean(
cross_val_score(estimator = LinearRegression(),
X = Matrix(df[!, names(df, features)]),
y = df.target,
cv = cv,
groups = eras,
scoring = make_scorer(cor, greater_is_better = true)
)
)
)
end
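The train_test_pairs hook imported at the top lets you express the same era-grouped splitting natively in MLJ instead of going through scikit-learn. Here is a hedged sketch (GroupedEraCV is a made-up name, and it assumes MLJ's documented custom-resampling interface of subtyping ResamplingStrategy and extending train_test_pairs); it splits the distinct eras, not the rows, into contiguous blocks so no era leaks across folds.
# Sketch of a custom MLJ resampling strategy that keeps whole eras together
struct GroupedEraCV <: MLJ.ResamplingStrategy
    eras::Vector{Int}   # era label for every row of the training data
    nfolds::Int
end
function train_test_pairs(cv::GroupedEraCV, rows)
    unique_eras = sort(unique(cv.eras[rows]))
    block_size = ceil(Int, length(unique_eras) / cv.nfolds)
    map(Iterators.partition(unique_eras, block_size)) do block
        test_eras = Set(block)
        train = filter(r -> !(cv.eras[r] in test_eras), rows)
        test  = filter(r ->   cv.eras[r] in test_eras, rows)
        (train, test)
    end
end
# e.g. evaluate!(machine(Linear(), df[!, features], df.target),
#                resampling = GroupedEraCV(df.erano, 5), measure = rms, verbosity = 0)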
eras10 = (eras .÷ 10) * 10
countmap(eras10)
gdf = copy(df)
gdf[:, :eras10 ] = eras10
gdf = groupby(filter(row -> row[:eras10] < 120, gdf), :eras10);
results10 = DataFrame(train_era = Int32[], test_era = Int32[], value = Float32[])
for train_era in keys(gdf)
println(train_era[1])
gdf₁ = gdf[train_era]
model = LinearRegression()
ScikitLearn.fit!(model, Matrix(gdf₁[!, names(gdf₁, features)]), gdf₁.target)
for test_era in keys(gdf)
gdf₂ = gdf[test_era]
push!(results10, [train_era[1],
test_era[1],
cor(gdf₂.target, ScikitLearn.predict(model, Matrix(gdf₂[!, names(gdf₂, features)])))])
end
end
results_df = unstack(results10, :test_era, :value)
heatmap(clamp!(Matrix(select(results_df, Not(:train_era))), -.04, .04), c=palette(:RdYlGn))
Here is an advanced paper that talks about generalization; eras can be thought of in the same way that "distributions" or "environments" are discussed there: https://arxiv.org/pdf/1907.02893.pdf
df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
function our_score(preds, dtrain)
return "score", cor(get_info(dtrain, "label"), preds)
end
dtrain = DMatrix(Matrix(df₁[!, features]), label=df₁.target)
dtest = DMatrix(Matrix(df₂[!, features]), label=df₂.target)
dall = DMatrix(Matrix(df[!, features]), label=df.target);
# The source code shows that the per-round evaluation results only get printed to stderr - one could redirect them to a
# buffer and regex-parse them into an array (a sketch of this follows the training call below), but realistically the
# effort isn't usually worth it, since the out-of-sample differences are clear enough from the printed numbers.
param = Dict(
"eta" => 0.1,
"max_depth" => 3,
"objective" => "reg:squarederror",
"eval_metric" => "rmse"
)
xgboost(dtrain,
100,
param = param,
watchlist = [(dtrain, "train"), (dtest, "test")],
feval = our_score
)
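If you do want those printed numbers programmatically, one option is the redirect-and-parse approach mentioned above: send stderr to a temporary file while training, then pull the test scores back out with a regex. This is only a sketch; the exact log format (and whether everything lands on Julia's stderr) depends on the XGBoost.jl version, so treat the "test-score" pattern as an assumption.
# Sketch: capture the per-round eval lines and parse out the custom "test-score" values
test_scores = mktemp() do path, io
    redirect_stderr(io) do
        xgboost(dtrain, 100,
                param = param,
                watchlist = [(dtrain, "train"), (dtest, "test")],
                feval = our_score)
    end
    flush(io)
    [parse(Float64, m.captures[1]) for m in eachmatch(r"test-score:([-+0-9.eE]+)", read(path, String))]
end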
df₁ = df[coalesce.(eras .<= median(eras), false), :]
df₂ = df[coalesce.(eras .> median(eras), false), :];
XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
Linear = @load LinearRegressor pkg=MLJLinearModels verbosity=0
Elastic = @load ElasticNetRegressor pkg=MLJLinearModels verbosity=0
models = vcat(
[Linear()],
[Elastic(lambda = λ) for λ in [0.01, 0.005, 0.002, 0.001, 0.0005, 0.0002, 0.0001, 0.00005, 0.00002, 0.00001]],
[XGB()],
[XGB(eta = 0.01, num_round=1000)],
[XGB(eta = 0.01, colsample_bytree=0.1, num_round=1000)],
[XGB(eta = 0.01, colsample_bytree=0.1, num_round=1000, max_depth=5)],
[XGB(eta = 0.001, colsample_bytree=0.1, num_round=1000, max_depth=5)]
);
for model in models
print(" -- ", model, "\n")
mach = machine(model, df₁[!, features], df₁.target)
MLJ.fit!(mach, verbosity=0)
outsample = numerai_score(df₂.target, MLJ.predict(mach, df₂[!, features]), df₂)
insample = numerai_score(df₁.target, MLJ.predict(mach, df₁[!, features]), df₁)
print("outsample: $outsample, insample: $insample", "\n")
end
XGB = @load XGBoostRegressor pkg=XGBoost verbosity=0
xgb = XGB(eta = 0.01, max_depth=5, num_round=1000);
mach = machine(xgb, df₁[!, features], df₁.target)
MLJ.fit!(mach, verbosity=0)
xgb_preds = MLJ.predict(mach, df₂[!, features]);
xgb_preds
cor_list = Float64[]
for feature in features
    push!(cor_list, cor(df₂[!, feature], xgb_preds))
end
describe(DataFrame(cor_list = cor_list), :all)
# Neutralize a column of predictions against the given features:
# gaussianize the per-era ranks, then remove (a proportion of) the component
# that is linearly explained by the feature exposures.
function norm_neut(df, column, feats, proportion=1.0)
    ranks = (ordinalrank(df[!, column]) .- 0.5) ./ length(df[!, column])
    scores = quantile.(Normal(0.0, 1.0), ranks)
    exposures = Matrix(df[!, feats])
    neutralized = scores - proportion * exposures * (pinv(exposures) * scores)
    return neutralized / std(neutralized)
end;
df₂.preds = xgb_preds
df₂[:, :preds_neutralized] = combine(x -> norm_neut(x, :preds, features, 0.5), groupby(df₂, :erano)).x1
x_min = minimum(df₂.preds_neutralized)
x_max = maximum(df₂.preds_neutralized)
# rescale to [0, 1], replicating sklearn's MinMaxScaler with feature_range = (0, 1)
X_std = (df₂.preds_neutralized .- x_min) ./ (x_max - x_min)
df₂[!, :preds_neutralized] = X_std * (1 - 0) .+ 0;
describe(df₂.preds_neutralized)
cor_list2 = Float64[]
for feature in features
    push!(cor_list2, cor(df₂[!, feature], df₂.preds_neutralized))
end
describe(DataFrame(cor_list2 = cor_list2), :all)
unbalanced_scores_per_era = combine(x -> cor(x.preds, x.target), groupby(df₂, :era))
balanced_scores_per_era = combine(x -> cor(x.preds_neutralized, x.target), groupby(df₂, :era));
println("score for high feature exposure: ", mean(unbalanced_scores_per_era.x1))
println("score for balanced feature expo: ", mean(balanced_scores_per_era.x1))
println("std for high feature exposure: ", std(unbalanced_scores_per_era.x1))
println("std for balanced feature expo: ", std(balanced_scores_per_era.x1))
println("sharpe for high feature exposure: ", mean(unbalanced_scores_per_era.x1)/std(unbalanced_scores_per_era.x1))
println("sharpe for balanced feature expo: ", mean(balanced_scores_per_era.x1)/std(balanced_scores_per_era.x1))
describe(balanced_scores_per_era.x1)
describe(unbalanced_scores_per_era.x1)