% ml_case_study.bib -- bibliography database for the ml_case_study project (GitHub: YalDan/ml_case_study, branch main).

% arXiv preprint. The arXiv identifier is stored in eprint/archiveprefix instead of
% being embedded in journal/note, so each style can format it on its own terms.
@misc{josse_consistency_2020,
	title = {On the consistency of supervised learning with missing values},
	author = {Josse, Julie and Prost, Nicolas and Scornet, Erwan and Varoquaux, Ga{\"e}l},
	month = jul,
	year = {2020},
	eprint = {1902.06931},
	archiveprefix = {arXiv},
	url = {https://arxiv.org/abs/1902.06931},
	urldate = {2022-04-28},
	abstract = {In many application settings, the data have missing entries which make analysis challenging. An abundant literature addresses missing values in an inferential framework: estimating parameters and their variance from incomplete tables. Here, we consider supervised-learning settings: predicting a target when missing values appear in both training and testing data. We show the consistency of two approaches in prediction. A striking result is that the widely-used method of imputing with a constant, such as the mean prior to learning is consistent when missing values are not informative. This contrasts with inferential settings where mean imputation is pointed at for distorting the distribution of the data. That such a simple approach can be consistent is important in practice. We also show that a predictor suited for complete observations can predict optimally on incomplete data, through multiple imputation. Finally, to compare imputation with learning directly with a model that accounts for missing values, we analyze further decision trees. These can naturally tackle empirical risk minimization with missing values, due to their ability to handle the half-discrete nature of incomplete variables. After comparing theoretically and empirically different missing values strategies in trees, we recommend using the "missing incorporated in attribute" method as it can handle both non-informative and informative missing values.},
	keywords = {Computer Science - Machine Learning, Mathematics - Statistics Theory, Statistics - Machine Learning},
}

% arXiv preprint. The arXiv identifier is stored in eprint/archiveprefix instead of
% being embedded in journal/note, so each style can format it on its own terms.
@misc{scornet_trees_2021,
	title = {Trees, forests, and impurity-based variable importance},
	author = {Scornet, Erwan},
	month = dec,
	year = {2021},
	eprint = {2001.04295},
	archiveprefix = {arXiv},
	url = {https://arxiv.org/abs/2001.04295},
	urldate = {2022-04-28},
	abstract = {Tree ensemble methods such as random forests [Breiman, 2001] are very popular to handle high-dimensional tabular data sets, notably because of their good predictive accuracy. However, when machine learning is used for decision-making problems, settling for the best predictive procedures may not be reasonable since enlightened decisions require an in-depth comprehension of the algorithm prediction process. Unfortunately, random forests are not intrinsically interpretable since their prediction results from averaging several hundreds of decision trees. A classic approach to gain knowledge on this so-called black-box algorithm is to compute variable importances, that are employed to assess the predictive impact of each input variable. Variable importances are then used to rank or select variables and thus play a great role in data analysis. Nevertheless, there is no justification to use random forest variable importances in such way: we do not even know what these quantities estimate. In this paper, we analyze one of the two well-known random forest variable importances, the Mean Decrease Impurity (MDI). We prove that if input variables are independent and in absence of interactions, MDI provides a variance decomposition of the output, where the contribution of each variable is clearly identified. We also study models exhibiting dependence between input variables or interaction, for which the variable importance is intrinsically ill-defined. Our analysis shows that there may exist some benefits to use a forest compared to a single tree.},
	keywords = {Mathematics - Statistics Theory, Statistics - Machine Learning},
}

% Journal article. The title is stored in title case (not forced all-caps) so the
% bibliography style decides the final casing; the publisher has its own field
% rather than being printed through note.
@article{scornet_consistency_2015,
	title = {Consistency of Random Forests},
	volume = {43},
	issn = {0090-5364},
	url = {https://www.jstor.org/stable/43556658},
	abstract = {Random forests are a learning algorithm proposed by Breiman [Mach. Learn. 45 (2001) 5-32] that combines several randomized decision trees and aggregates their predictions by averaging. Despite its wide usage and outstanding practical performance, little is known about the mathematical properties of the procedure. This disparity between theory and practice originates in the difficulty to simultaneously analyze both the randomization process and the highly data-dependent tree structure. In the present paper, we take a step forward in forest exploration by proving a consistency result for Breiman's [Mach. Learn. 45 (2001) 5-32] original algorithm in the context of additive regression models. Our analysis also sheds an interesting light on how random forests can nicely adapt to sparsity.},
	number = {4},
	urldate = {2022-04-28},
	journal = {The Annals of Statistics},
	author = {Scornet, Erwan and Biau, G{\'e}rard and Vert, Jean-Philippe},
	year = {2015},
	publisher = {Institute of Mathematical Statistics},
	pages = {1716--1741},
}

% Chapter of an authored book. The exported editor list only repeated the author
% list, which made the book render as an edited collection, so it is omitted here.
% Per-word brace protection is removed so styles can apply their own casing.
@incollection{hastie_boosting_2009,
	address = {New York, NY},
	series = {Springer Series in Statistics},
	title = {Boosting and Additive Trees},
	isbn = {978-0-387-84858-7},
	url = {https://doi.org/10.1007/978-0-387-84858-7_10},
	language = {en},
	urldate = {2022-04-28},
	booktitle = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
	publisher = {Springer},
	author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
	year = {2009},
	doi = {10.1007/978-0-387-84858-7_10},
	keywords = {Data Mining Application, Generalize Additive Model, Loss Function, Median House, Partial Dependence},
	pages = {337--387},
}

% Journal article. Only the proper names (and the word after the colon) stay
% brace-protected in the title; the publisher has its own field instead of note.
% NOTE(review): pages = 1--1 looks like a publisher-metadata artifact; confirm the
% real page range against the published article before relying on it.
@article{diebold_comparing_2015,
	title = {Comparing Predictive Accuracy, Twenty Years Later: {A} Personal Perspective on the Use and Abuse of {Diebold--Mariano} Tests},
	volume = {33},
	issn = {0735-0015},
	shorttitle = {Comparing Predictive Accuracy, Twenty Years Later},
	url = {https://doi.org/10.1080/07350015.2014.983236},
	doi = {10.1080/07350015.2014.983236},
	abstract = {The Diebold–Mariano (DM) test was intended for comparing forecasts; it has been, and remains, useful in that regard. The DM test was not intended for comparing models. Much of the large ensuing literature, however, uses DM-type tests for comparing models, in pseudo-out-of-sample environments. In that case, simpler yet more compelling full-sample model comparison procedures exist; they have been, and should continue to be, widely used. The hunch that pseudo-out-of-sample analysis is somehow the “only,” or “best,” or even necessarily a “good” way to provide insurance against in-sample overfitting in model comparisons proves largely false. On the other hand, pseudo-out-of-sample analysis remains useful for certain tasks, perhaps most notably for providing information about comparative predictive performance during particular historical episodes.},
	number = {1},
	urldate = {2022-05-02},
	journal = {Journal of Business \& Economic Statistics},
	author = {Diebold, Francis X.},
	month = jan,
	year = {2015},
	publisher = {Taylor \& Francis},
	keywords = {Forecasting, Model comparison, Model selection, Out-of-sample tests},
	pages = {1--1},
}