diff --git a/LaTeX/main.bib b/LaTeX/main.bib index e327240..b912c2d 100644 --- a/LaTeX/main.bib +++ b/LaTeX/main.bib @@ -1,2453 +1,2417 @@ @book{AbadirMagnus2005, author = {Abadir, Karim M. and Magnus, Jan R.}, - collection = {Econometric Exercises}, - doi = {10.1017/CBO9780511810800}, - place = {Cambridge}, - publisher = {Cambridge University Press}, - series = {Econometric Exercises}, + year = {2005}, title = {Matrix Algebra}, - year = {2005} + publisher = {Cambridge University Press}, + doi = {10.1017/CBO9780511810800}, + collection = {Econometric Exercises}, + place = {Cambridge}, + series = {Econometric Exercises} } @book{AbsilEtAl2007, author = {Absil, P.-A. and Mahony, R. and Sepulchre, R.}, + year = {2008}, + title = {{Optimization Algorithms on Matrix Manifolds}}, + pages = {xvi+224}, + publisher = {Princeton University Press, Princeton, NJ}, doi = {10.1515/9781400830244}, isbn = {978-0-691-13298-3}, mrclass = {90-02 (58E17 90C30 90C52)}, mrnumber = {2364186}, - note = {Full Online Text \url{https://press.princeton.edu/absil}}, - pages = {xvi+224}, - publisher = {Princeton University Press, Princeton, NJ}, - title = {{Optimization Algorithms on Matrix Manifolds}}, - url = {https://doi.org/10.1515/9781400830244}, - year = {2008} + note = {Full Online Text \url{https://press.princeton.edu/absil}} } @article{AdragniCook2009, author = {Adragni, Kofi P. and Cook, R. Dennis}, - doi = {10.1098/rsta.2009.0110}, - fjournal = {Philosophical Transactions of the Royal Society of London. Series A. Mathematical, Physical and Engineering Sciences}, - issn = {1364-503X,1471-2962}, + year = {2009}, + title = {Sufficient dimension reduction and prediction in regression}, journal = {Philos. Trans. R. Soc. Lond. Ser. A Math. Phys. Eng. Sci.}, - mrclass = {62J02 (62H25)}, - mrnumber = {2546393}, + volume = {367}, number = {1906}, pages = {4385--4405}, - title = {Sufficient dimension reduction and prediction in regression}, - url = {https://doi.org/10.1098/rsta.2009.0110}, - volume = {367}, - year = {2009} + issn = {1364-503X,1471-2962}, + doi = {10.1098/rsta.2009.0110}, + fjournal = {Philosophical Transactions of the Royal Society of London. Series A. Mathematical, Physical and Engineering Sciences}, + mrclass = {62J02 (62H25)}, + mrnumber = {2546393} } @book{Anderson2003, - address = {New York, NY}, author = {T. W. 
Anderson}, - edition = {third}, - publisher = {Wiley}, + year = {2003}, title = {An Introduction to Multivariate Statistical Analysis}, - year = {2003} + publisher = {Wiley}, + address = {New York, NY}, + edition = {third} } @book{Arnold1981, - address = {New York, NY [u.a.]}, author = {Arnold, Steven F}, + year = {1981}, + title = {The theory of linear models and multivariate analysis}, + publisher = {Wiley}, + address = {New York, NY [u.a.]}, isbn = {0471050652}, keywords = {Multivariate Analyse}, language = {eng}, - publisher = {Wiley}, - series = {Wiley series in probability and mathematical statistics : Probability and mathematical statistics}, - title = {The theory of linear models and multivariate analysis}, - year = {1981} + series = {Wiley series in probability and mathematical statistics : Probability and mathematical statistics} } @article{BanerjeeEtAl2008, author = {Onureena Banerjee and Laurent El Ghaoui and Alexandre d'Aspremont}, + year = {2008}, + title = {Model Selection Through Sparse Maximum Likelihood Estimation for Multivariate Gaussian or Binary Data}, journal = {Journal of Machine Learning Research}, + volume = {9}, number = {15}, pages = {485-516}, - title = {Model Selection Through Sparse Maximum Likelihood Estimation for Multivariate Gaussian or Binary Data}, - url = {http://jmlr.org/papers/v9/banerjee08a.html}, - volume = {9}, - year = {2008} + url = {http://jmlr.org/papers/v9/banerjee08a.html} } @article{BasserPajevic2000, author = {Basser, Peter J. and Pajevic, Sinisa}, - doi = {10.1002/1522-2594(200007)44:1<41::AID-MRM8>3.0.CO;2-O}, + year = {2000}, + title = {Statistical artifacts in diffusion tensor MRI (DT-MRI) caused by background noise}, journal = {Magnetic Resonance in Medicine}, + volume = {44}, number = {1}, pages = {41-50}, - title = {Statistical artifacts in diffusion tensor MRI (DT-MRI) caused by background noise}, - url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/1522-2594%28200007%2944%3A1%3C41%3A%3AAID-MRM8%3E3.0.CO%3B2-O}, - volume = {44}, - year = {2000} + doi = {10.1002/1522-2594(200007)44:1<41::AID-MRM8>3.0.CO;2-O} } @article{BasserPajevic2003, author = {Basser, Peter J and Pajevic, Sinisa}, + year = {2003}, + title = {A normal distribution for tensor-valued random variables: applications to diffusion tensor MRI}, journal = {IEEE transactions on medical imaging}, + volume = {22}, number = {7}, pages = {785--794}, - publisher = {IEEE}, - title = {A normal distribution for tensor-valued random variables: applications to diffusion tensor MRI}, - volume = {22}, - year = {2003} + publisher = {IEEE} } @article{BasserPajevic2007, author = {Peter J. 
Basser and Sinisa Pajevic}, - doi = {10.1016/j.sigpro.2006.02.050}, - issn = {0165-1684}, + year = {2007}, + title = {Spectral decomposition of a 4th-order covariance tensor: Applications to diffusion tensor MRI}, journal = {Signal Processing}, - note = {Tensor Signal Processing}, + volume = {87}, number = {2}, pages = {220-236}, - title = {Spectral decomposition of a 4th-order covariance tensor: Applications to diffusion tensor MRI}, - url = {https://www.sciencedirect.com/science/article/pii/S0165168406001678}, - volume = {87}, - year = {2007} + issn = {0165-1684}, + doi = {10.1016/j.sigpro.2006.02.050}, + note = {Tensor Signal Processing} } @article{Besag1974, author = {Besag, Julian}, - doi = {10.1111/j.2517-6161.1974.tb00999.x}, + year = {1974}, + title = {Spatial Interaction and the Statistical Analysis of Lattice Systems}, journal = {Journal of the Royal Statistical Society: Series B (Methodological)}, + volume = {36}, number = {2}, pages = {192-225}, - title = {Spatial Interaction and the Statistical Analysis of Lattice Systems}, - url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1974.tb00999.x}, - volume = {36}, - year = {1974} + doi = {10.1111/j.2517-6161.1974.tb00999.x} } @incollection{Bottou1998, - address = {Cambridge, UK}, author = {Bottou, L\'{e}on}, + year = {1998}, + title = {Online Algorithms and Stochastic Approximations}, + publisher = {Cambridge University Press}, + address = {Cambridge, UK}, booktitle = {Online Learning and Neural Networks}, editor = {Saad, David}, - note = {\url{http://leon.bottou.org/papers/bottou-98x} revised, Oct 2012}, - publisher = {Cambridge University Press}, - title = {Online Algorithms and Stochastic Approximations}, - year = {1998} + note = {\url{http://leon.bottou.org/papers/bottou-98x} revised, Oct 2012} } @article{Brenner2018, author = {Brenner, A.V. and Inskip, P.D. and Rusiecki, J. and Rabkin, C.S. and Engels, J. and Pfeiffer, R.M.}, - document_type = {Article}, - doi = {10.1038/s41416-018-0272-x}, + year = {2018}, + title = {Serially measured pre-diagnostic levels of serum cytokines and risk of brain cancer in active component military personnel}, journal = {British Journal of Cancer}, + volume = {119}, number = {7}, pages = {893-900}, - title = {Serially measured pre-diagnostic levels of serum cytokines and risk of brain cancer in active component military personnel}, - volume = {119}, - year = {2018} + doi = {10.1038/s41416-018-0272-x}, + document_type = {Article} } @article{BrownEtAl2001, author = {Brown, P. J. and Kenward, M. G. and Bassett, E. E.}, - doi = {10.1093/biostatistics/2.4.417}, - issn = {1465-4644}, + year = {2001}, + title = {Bayesian discrimination with longitudinal data}, journal = {Biostatistics}, - month = {12}, + volume = {2}, number = {4}, pages = {417-432}, - title = {Bayesian discrimination with longitudinal data}, - volume = {2}, - year = {2001} + issn = {1465-4644}, + doi = {10.1093/biostatistics/2.4.417}, + month = {12} } @article{BuraCook2001a, author = {Efstathia Bura and R. Dennis Cook}, - doi = {10.1198/016214501753208979}, + year = {2001}, + title = {Extending Sliced Inverse Regression}, journal = {Journal of the American Statistical Association}, + volume = {96}, number = {455}, pages = {996-1003}, publisher = {Taylor & Francis}, - title = {Extending Sliced Inverse Regression}, - volume = {96}, - year = {2001} + doi = {10.1198/016214501753208979} } @article{BuraCook2001b, author = {Bura, Efstathia and Cook, R. Dennis}, - fjournal = {Journal of the Royal Statistical Society. 
Series B: Statistical Methodology}, - issn = {1369-7412}, + year = {2001}, + title = {Estimating the structural dimension of regressions via parametric inverse regression}, journal = {J. R. Stat. Soc. Ser. B Stat. Methodol.}, + volume = {63}, number = {2}, pages = {393--410}, publisher = {Wiley-Blackwell}, - title = {Estimating the structural dimension of regressions via parametric inverse regression}, - volume = {63}, - year = {2001} + issn = {1369-7412}, + fjournal = {Journal of the Royal Statistical Society. Series B: Statistical Methodology} } @article{BuraDuarteForzani2016, author = {Efstathia Bura and Sabrina Duarte and Liliana Forzani}, - doi = {10.1080/01621459.2015.1093944}, - fjournal = {Journal of the American Statistical Association}, + year = {2016}, + title = {Sufficient Reductions in Regressions With Exponential Family Inverse Predictors}, journal = {J. Amer. Statist. Assoc.}, + volume = {111}, number = {515}, pages = {1313--1329}, publisher = {Taylor \& Francis}, - title = {Sufficient Reductions in Regressions With Exponential Family Inverse Predictors}, - volume = {111}, - year = {2016} + doi = {10.1080/01621459.2015.1093944}, + fjournal = {Journal of the American Statistical Association} } @article{BuraEtAl2018, author = {Bura, Efstathia and Duarte, Sabrina and Forzani, Liliana and E. Smucler and M. Sued}, - doi = {10.1080/02331888.2018.1467420}, - fjournal = {Statistics. A Journal of Theoretical and Applied Statistics}, + year = {2018}, + title = {Asymptotic theory for maximum likelihood estimates in reduced-rank multivariate generalized linear models}, journal = {Statistics}, + volume = {52}, number = {5}, pages = {1005--1024}, publisher = {Taylor \& Francis}, - title = {Asymptotic theory for maximum likelihood estimates in reduced-rank multivariate generalized linear models}, - volume = {52}, - year = {2018} + doi = {10.1080/02331888.2018.1467420}, + fjournal = {Statistics. A Journal of Theoretical and Applied Statistics} } @article{BuraEtAl2022, author = {Bura, Efstathia and Forzani, Liliana and Garc\'{i}a Arancibia, Rodrigo and Llop, Pamela and Tomassi, Diego}, - fjournal = {Journal of Machine Learning Research (JMLR)}, - issn = {1532-4435,1533-7928}, + year = {2022}, + title = {Sufficient reductions in regression with mixed predictors}, journal = {J. Mach. Learn. Res.}, - mrclass = {62H12 (62H22 62H25 62J02 65C20)}, - mrnumber = {4576687}, + volume = {23}, number = {102}, pages = {1--47}, - title = {Sufficient reductions in regression with mixed predictors}, + issn = {1532-4435,1533-7928}, url = {http://jmlr.org/papers/v23/21-0175.html}, - volume = {23}, - year = {2022} + fjournal = {Journal of Machine Learning Research (JMLR)}, + mrclass = {62H12 (62H22 62H25 62J02 65C20)}, + mrnumber = {4576687} } @article{BuraForzani2015, author = {Bura, Efstathia and Forzani, Liliana}, - doi = {10.1080/01621459.2014.914440}, - fjournal = {Journal of the American Statistical Association}, - issn = {0162-1459,1537-274X}, + year = {2015}, + title = {{Sufficient Reductions in Regressions With Elliptically Contoured Inverse Predictors}}, journal = {J. Amer. Statist. 
Assoc.}, - mrclass = {62J02 (62J05 62P10)}, + volume = {110}, number = {509}, pages = {420--434}, publisher = {Taylor \& Francis}, - title = {{Sufficient Reductions in Regressions With Elliptically Contoured Inverse Predictors}}, - volume = {110}, - year = {2015} + issn = {0162-1459,1537-274X}, + doi = {10.1080/01621459.2014.914440}, + fjournal = {Journal of the American Statistical Association}, + mrclass = {62J02 (62J05 62P10)} } @article{Burdick1995, author = {Donald S. Burdick}, - doi = {10.1016/0169-7439(95)80060-M}, - issn = {0169-7439}, + year = {1995}, + title = {An introduction to tensor products with applications to multiway data analysis}, journal = {Chemometrics and Intelligent Laboratory Systems}, + volume = {28}, number = {2}, pages = {229-237}, - title = {An introduction to tensor products with applications to multiway data analysis}, - url = {https://www.sciencedirect.com/science/article/pii/016974399580060M}, - volume = {28}, - year = {1995} + issn = {0169-7439}, + doi = {10.1016/0169-7439(95)80060-M} } @article{Burges2010, author = {Christopher J. C. Burges}, - doi = {10.1561/2200000002}, - issn = {1935-8237}, + year = {2010}, + title = {Dimension Reduction: A Guided Tour}, journal = {Foundations and Trends® in Machine Learning}, + volume = {2}, number = {4}, pages = {275-365}, - title = {Dimension Reduction: A Guided Tour}, - volume = {2}, - year = {2010} + issn = {1935-8237}, + doi = {10.1561/2200000002} } @article{Bury2013, author = {Thomas Bury}, - doi = {10.1016/j.physa.2012.10.046}, - issn = {0378-4371}, + year = {2013}, + title = {Market structure explained by pairwise interactions}, journal = {Physica A: Statistical Mechanics and its Applications}, + volume = {392}, number = {6}, pages = {1375-1385}, - title = {Market structure explained by pairwise interactions}, - volume = {392}, - year = {2013} + issn = {0378-4371}, + doi = {10.1016/j.physa.2012.10.046} } @article{CandesEtAl2008, author = {Cand\`es, Emmanuel J. and Wakin, Michael B. and Boyd, Stephen P.}, - doi = {10.1007/s00041-008-9045-x}, - fjournal = {The Journal of Fourier Analysis and Applications}, - issn = {1069-5869,1531-5851}, + year = {2008}, + title = {Enhancing sparsity by reweighted {$l_1$} minimization}, journal = {J. Fourier Anal. Appl.}, - mrclass = {90C25 (90C27 94A12)}, - mrnumber = {2461611}, + volume = {14}, number = {5-6}, pages = {877--905}, - title = {Enhancing sparsity by reweighted {$l_1$} minimization}, - url = {https://doi.org/10.1007/s00041-008-9045-x}, - volume = {14}, - year = {2008} + issn = {1069-5869,1531-5851}, + doi = {10.1007/s00041-008-9045-x}, + fjournal = {The Journal of Fourier Analysis and Applications}, + mrclass = {90C25 (90C27 94A12)}, + mrnumber = {2461611} } @article{CarrollLi1995, author = {Carroll, Raymond James and Li, Ker-Chau}, - fjournal = {Statistica Sinica}, - issn = {1017-0405,1996-8507}, + year = {1995}, + title = {Binary regressors in dimension reduction models: a new look at treatment comparisons}, journal = {Statist. 
Sinica}, - mrclass = {62J02 (62G05)}, - mrnumber = {1347614}, + volume = {5}, number = {2}, pages = {667--688}, - title = {Binary regressors in dimension reduction models: a new look at treatment comparisons}, + issn = {1017-0405,1996-8507}, url = {https://api.semanticscholar.org/CorpusID:1648354}, - volume = {5}, - year = {1995} + fjournal = {Statistica Sinica}, + mrclass = {62J02 (62G05)}, + mrnumber = {1347614} } @book{CasellaBerger2002, author = {Casella, George and Berger, Roger L.}, + year = {2002}, + title = {{Statistical Inference}}, + publisher = {Thomson Learning}, edition = {2}, isbn = {0-534-24312-6}, - publisher = {Thomson Learning}, - series = {Duxbury Advanced Series}, - title = {{Statistical Inference}}, - year = {2002} + series = {Duxbury Advanced Series} } @article{ChakrabortyEtAl2022, author = {Anirban Chakraborty and Matthias Katzfuss and Joseph Guinness}, - doi = {10.1016/j.spasta.2022.100708}, - issn = {2211--6753}, - journal = {Spatial Statistics}, - pages = {100708}, + year = {2022}, title = {Ordered conditional approximation of Potts models}, - url = {https://www.sciencedirect.com/science/article/pii/S2211675322000690}, + journal = {Spatial Statistics}, volume = {52}, - year = {2022} + pages = {100708}, + issn = {2211--6753}, + doi = {10.1016/j.spasta.2022.100708} } @article{ChenEtAl2021, author = {Chen, You-Lin and Kolar, Mladen and Tsay, Ruey S.}, - doi = {10.1080/10618600.2020.1856118}, + year = {2021}, + title = {Tensor Canonical Correlation Analysis With Convergence and Statistical Guarantees}, journal = {Journal of Computational and Graphical Statistics}, + volume = {30}, number = {3}, pages = {728--744}, - title = {Tensor Canonical Correlation Analysis With Convergence and Statistical Guarantees}, - volume = {30}, - year = {2021} + doi = {10.1080/10618600.2020.1856118} } @article{ChengEtAl2014, author = {Cheng, Jie and Levina, Elizaveta and Wang, Pei and Zhu, Ji}, - doi = {10.1111/biom.12202}, + year = {2014}, + title = {A sparse Ising model with covariates}, journal = {Biometrics}, + volume = {70}, number = {4}, pages = {943-953}, - title = {A sparse Ising model with covariates}, - volume = {70}, - year = {2014} + doi = {10.1111/biom.12202} } @article{ChenZouCook2010, author = {Chen, Xin and Zou, Changliang and Cook, R. Dennis}, - doi = {10.1214/10-AOS826}, - fjournal = {The Annals of Statistics}, + year = {2010}, + title = {Coordinate-independent sparse sufficient dimension reduction and variable selection}, journal = {Ann. Statist.}, - month = {12}, + volume = {38}, number = {6}, pages = {3696--3723}, publisher = {The Institute of Mathematical Statistics}, - title = {Coordinate-independent sparse sufficient dimension reduction and variable selection}, - url = {https://doi.org/10.1214/10-AOS826}, - volume = {38}, - year = {2010} + doi = {10.1214/10-AOS826}, + fjournal = {The Annals of Statistics}, + month = {12} } @article{ChiaroCookLi2002, author = {Chiaromonte, F. and Cook, R. Dennis and Li, B.}, - fjournal = {The Annals of Statistics}, - issue = {2}, + year = {2002}, + title = {Sufficient dimension reduction in regressions with categorical predictors}, journal = {Ann. Statist.}, + volume = {30}, pages = {475-497}, publisher = {The Institute of Mathematical Statistics}, - title = {Sufficient dimension reduction in regressions with categorical predictors}, - volume = {30}, - year = {2002} + fjournal = {The Annals of Statistics}, + issue = {2} } @article{ChiaromonteCook2002, author = {Chiaromonte, Francesca and Cook, R. 
Dennis}, - doi = {10.1023/A:1022411301790}, - fjournal = {Annals of the Institute of Statistical Mathematics}, - issn = {0020-3157,1572-9052}, + year = {2002}, + title = {Sufficient dimension reduction and graphics in regression}, journal = {Ann. Inst. Statist. Math.}, - mrclass = {62J05 (62H99)}, - mrnumber = {1954046}, + volume = {54}, number = {4}, pages = {768--795}, - title = {Sufficient dimension reduction and graphics in regression}, - url = {https://doi.org/10.1023/A:1022411301790}, - volume = {54}, - year = {2002} + issn = {0020-3157,1572-9052}, + doi = {10.1023/A:1022411301790}, + fjournal = {Annals of the Institute of Statistical Mathematics}, + mrclass = {62J05 (62H99)}, + mrnumber = {1954046} } @article{ClevelandDevlin1988, author = {William S. Cleveland and Susan J. Devlin}, - doi = {10.1080/01621459.1988.10478639}, - fjournal = {Journal of the American Statistical Association}, + year = {1988}, + title = {{Locally Weighted Regression: An Approach to Regression Analysis by Local Fitting}}, journal = {J. Amer. Statist. Assoc.}, + volume = {83}, number = {403}, pages = {596--610}, publisher = {Taylor \& Francis}, - title = {{Locally Weighted Regression: An Approach to Regression Analysis by Local Fitting}}, - volume = {83}, - year = {1988} + doi = {10.1080/01621459.1988.10478639}, + fjournal = {Journal of the American Statistical Association} } @incollection{Comon2002, author = {Comon, Pierre}, - booktitle = {{Mathematics in Signal Processing V}}, + year = {2002}, + title = {{Tensor Decompositions: State of the Art and Applications}}, + publisher = {Oxford University Press}, doi = {10.1093/oso/9780198507345.003.0001}, eprint = {https://academic.oup.com/book/0/chapter/422056726/chapter-pdf/52392862/isbn-9780198507345-book-part-1.pdf}, + booktitle = {{Mathematics in Signal Processing V}}, isbn = {9780198507345}, - month = {06}, - publisher = {Oxford University Press}, - title = {{Tensor Decompositions: State of the Art and Applications}}, - year = {2002} + month = {06} } @inproceedings{Comon2009, author = {Comon, Pierre}, - booktitle = {2009 IEEE/SP 15th Workshop on Statistical Signal Processing}, - doi = {10.1109/SSP.2009.5278471}, - number = {}, - pages = {781-788}, + year = {2009}, title = {Tensors versus matrices usefulness and unexpected properties}, volume = {}, - year = {2009} + number = {}, + pages = {781-788}, + doi = {10.1109/SSP.2009.5278471}, + booktitle = {2009 IEEE/SP 15th Workshop on Statistical Signal Processing} } @book{Conway1997, - address = {New York}, author = {Conway, John B.}, + year = {1997}, + title = {A Course in Functional Analysis}, + number = {96}, + address = {New York}, edition = {2nd ed}, isbn = {978-0-387-97245-9}, - number = {96}, - series = {Graduate Texts in Mathematics}, - title = {A Course in Functional Analysis}, - year = {1997} + series = {Graduate Texts in Mathematics} } @article{Cook1994, author = {Cook, Dennis R.}, - journal = {Proc. Sect. Phys. Eng. Sci.}, - pages = {18--25}, + year = {1994}, title = {Using dimension-reduction subspaces to identify important inputs in models of physical systems}, - year = {1994} + journal = {Proc. Sect. Phys. Eng. Sci.}, + pages = {18--25} } @book{Cook1998, - address = {New York}, author = {Cook, Dennis R.}, - publisher = {Wiley}, + year = {1998}, title = {Regression Graphics: Ideas for studying regressions through graphics}, - year = {1998} + publisher = {Wiley}, + address = {New York} } @article{Cook2000, author = {R.
Dennis Cook}, - doi = {10.1080/03610920008832598}, + year = {2000}, + title = {Save: a method for dimension reduction and graphics in regression}, journal = {Communications in Statistics - Theory and Methods}, + volume = {29}, number = {9-10}, pages = {2109-2121}, publisher = {Taylor \& Francis}, - title = {Save: a method for dimension reduction and graphics in regression}, - url = {https://doi.org/10.1080/03610920008832598}, - volume = {29}, - year = {2000} + doi = {10.1080/03610920008832598} } @article{Cook2007, author = {Cook, R. Dennis}, - doi = {10.1214/088342306000000682}, + year = {2007}, + title = {{Fisher Lecture: Dimension Reduction in Regression}}, journal = {Statistical Science}, - month = {02}, + volume = {22}, number = {1}, pages = {1--26}, publisher = {The Institute of Mathematical Statistics}, - title = {{Fisher Lecture: Dimension Reduction in Regression}}, - volume = {22}, - year = {2007} + doi = {10.1214/088342306000000682}, + month = {02} } @article{Cook2018, author = {Cook, R. Dennis}, - doi = {10.1146/annurev-statistics-031017-100257}, + year = {2018}, + title = {Principal Components, Sufficient Dimension Reduction, and Envelopes}, journal = {Annual Review of Statistics and Its Application}, + volume = {5}, number = {1}, pages = {533-559}, - title = {Principal Components, Sufficient Dimension Reduction, and Envelopes}, - volume = {5}, - year = {2018} + doi = {10.1146/annurev-statistics-031017-100257} } @article{CookForzani2008, author = {Cook, R. D. and Forzani, L.}, - journal = {Statistical Science}, - number = {4}, - pages = {485-501}, + year = {2008}, title = {Principal fitted components for dimension reduction in regression}, + journal = {Statistical Science}, volume = {23}, - year = {2008} + number = {4}, + pages = {485-501} } @article{CookForzani2009, author = {R. Dennis Cook and Liliana Forzani}, - doi = {10.1198/jasa.2009.0106}, - issn = {0162-1459}, + year = {2009}, + title = {Likelihood-based sufficient dimension reduction}, journal = {Journal of the American Statistical Association}, - month = {3}, + volume = {104}, number = {485}, pages = {197--208}, publisher = {Taylor and Francis Ltd.}, - title = {Likelihood-based sufficient dimension reduction}, - volume = {104}, - year = {2009} + issn = {0162-1459}, + doi = {10.1198/jasa.2009.0106}, + month = {3} } @article{CookLi2002, author = {Cook, R.D. and Li, B.}, - doi = {10.1214/aos/1021379861}, - fjournal = {The Annals of Statistics}, + year = {2002}, + title = {Dimension reduction for conditional mean in regression}, journal = {The Annals of Statistics}, + volume = {30}, number = {2}, pages = {455--474}, publisher = {The Institute of Mathematical Statistics}, - title = {Dimension reduction for conditional mean in regression}, - volume = {30}, - year = {2002} + doi = {10.1214/aos/1021379861}, + fjournal = {The Annals of Statistics} } @article{CookLi2004, author = {Cook, R. Dennis and Li, Bing}, - doi = {10.1214/009053604000000661}, - fjournal = {The Annals of Statistics}, - issn = {0090-5364,2168-8966}, + year = {2004}, + title = {Determining the dimension of iterative {H}essian transformation}, journal = {Ann. 
Statist.}, - mrclass = {62G08 (62G09 62H05)}, - mrnumber = {2153993}, + volume = {32}, number = {6}, pages = {2501--2531}, - title = {Determining the dimension of iterative {H}essian transformation}, - url = {https://doi.org/10.1214/009053604000000661}, - volume = {32}, - year = {2004} + issn = {0090-5364,2168-8966}, + doi = {10.1214/009053604000000661}, + fjournal = {The Annals of Statistics}, + mrclass = {62G08 (62G09 62H05)}, + mrnumber = {2153993} } @article{CookLi2009, author = {Cook, R. Dennis and Li, Lexin}, - doi = {10.1198/jcgs.2009.08005}, - fjournal = {Journal of Computational and Graphical Statistics}, - issn = {1061-8600,1537-2715}, + year = {2009}, + title = {Dimension reduction in regressions with exponential family predictors}, journal = {J. Comput. Graph. Statist.}, - mrclass = {62J05}, - mrnumber = {2572637}, + volume = {18}, number = {3}, pages = {774--791}, - title = {Dimension reduction in regressions with exponential family predictors}, - url = {https://doi.org/10.1198/jcgs.2009.08005}, - volume = {18}, - year = {2009} + issn = {1061-8600,1537-2715}, + doi = {10.1198/jcgs.2009.08005}, + fjournal = {Journal of Computational and Graphical Statistics}, + mrclass = {62J05}, + mrnumber = {2572637} } @article{CookLiChiaromonte2010, author = {R. Dennis Cook and Bing Li and Francesca Chiaromonte}, - issn = {10170405, 19968507}, + year = {2010}, + title = {Envelope Models for Parsimonious and Efficient Multivariate Linear Regression}, journal = {Statistica Sinica}, + volume = {20}, number = {3}, pages = {927--960}, publisher = {Institute of Statistical Science, Academia Sinica}, - title = {Envelope Models for Parsimonious and Efficient Multivariate Linear Regression}, + issn = {10170405, 19968507}, url = {http://www.jstor.org/stable/24309466}, - urldate = {2024-03-29}, - volume = {20}, - year = {2010} + urldate = {2024-03-29} } @article{CookNachtsheim1994, author = {R. Dennis Cook and Christopher J. Nachtsheim}, - issn = {01621459}, + year = {1994}, + title = {Reweighting to Achieve Elliptically Contoured Covariates in Regression}, journal = {Journal of the American Statistical Association}, + volume = {89}, number = {426}, pages = {592--599}, publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]}, - title = {Reweighting to Achieve Elliptically Contoured Covariates in Regression}, + issn = {01621459}, url = {http://www.jstor.org/stable/2290862}, - urldate = {2024-01-18}, - volume = {89}, - year = {1994} + urldate = {2024-01-18} } @article{CookWeisberg1991, author = {Cook, R. Dennis and Sanford Weisberg}, - issn = {01621459}, + year = {1991}, + title = {Sliced Inverse Regression for Dimension Reduction: Comment}, journal = {Journal of the American Statistical Association}, + volume = {86}, number = {414}, pages = {328-332}, publisher = {[American Statistical Association, Taylor & Francis, Ltd.]}, - title = {Sliced Inverse Regression for Dimension Reduction: Comment}, - url = {http://www.jstor.org/stable/2290564}, - volume = {86}, - year = {1991} + issn = {01621459}, + url = {http://www.jstor.org/stable/2290564} } @article{CoxWermuth1994, author = {D. R. 
Cox and Nanny Wermuth}, - issn = {00063444}, + year = {1994}, + title = {A Note on the Quadratic Exponential Binary Distribution}, journal = {Biometrika}, + volume = {81}, number = {2}, pages = {403--408}, publisher = {[Oxford University Press, Biometrika Trust]}, - title = {A Note on the Quadratic Exponential Binary Distribution}, + issn = {00063444}, url = {http://www.jstor.org/stable/2336971}, - urldate = {2024-04-11}, - volume = {81}, - year = {1994} + urldate = {2024-04-11} } @book{Dai2012, author = {Dai, Bin}, + year = {2012}, + title = {Multivariate {B}ernoulli distribution models}, + pages = {109}, + publisher = {ProQuest LLC, Ann Arbor, MI}, + url = {http://gateway.proquest.com/openurl?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:dissertation&res_dat=xri:pqm&rft_dat=xri:pqdiss:3522035}, isbn = {978-1267-53750-8}, mrclass = {99-05}, mrnumber = {3078422}, - note = {Thesis (Ph.D.)--The University of Wisconsin - Madison}, - pages = {109}, - publisher = {ProQuest LLC, Ann Arbor, MI}, - title = {Multivariate {B}ernoulli distribution models}, - url = {http://gateway.proquest.com/openurl?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:dissertation&res_dat=xri:pqm&rft_dat=xri:pqdiss:3522035}, - year = {2012} + note = {Thesis (Ph.D.)--The University of Wisconsin - Madison} } @article{DaiDingWahba2013, author = {Dai, Bin and Ding, Shilin and Wahba, Grace}, - doi = {10.3150/12-BEJSP10}, - fjournal = {Bernoulli. Official Journal of the Bernoulli Society for Mathematical Statistics and Probability}, - issn = {1350-7265,1573-9759}, + year = {2013}, + title = {Multivariate {B}ernoulli distribution}, journal = {Bernoulli}, - mrclass = {62E15 (60E05 62G05 62H10 62H12 62J12)}, - mrnumber = {3102559}, + volume = {19}, number = {4}, pages = {1465--1483}, - title = {Multivariate {B}ernoulli distribution}, - url = {https://doi.org/10.3150/12-BEJSP10}, - volume = {19}, - year = {2013} + issn = {1350-7265,1573-9759}, + doi = {10.3150/12-BEJSP10}, + fjournal = {Bernoulli. Official Journal of the Bernoulli Society for Mathematical Statistics and Probability}, + mrclass = {62E15 (60E05 62G05 62H10 62H12 62J12)}, + mrnumber = {3102559} } @article{Dawid1981, author = {A. P. Dawid}, - issn = {00063444}, + year = {1981}, + title = {Some Matrix-Variate Distribution Theory: Notational Considerations and a Bayesian Application}, journal = {Biometrika}, + volume = {68}, number = {1}, pages = {265--274}, publisher = {[Oxford University Press, Biometrika Trust]}, - title = {Some Matrix-Variate Distribution Theory: Notational Considerations and a Bayesian Application}, + issn = {00063444}, url = {http://www.jstor.org/stable/2335827}, - urldate = {2024-01-12}, - volume = {68}, - year = {1981} + urldate = {2024-01-12} } @article{DeAlmeidaEtAl2007, author = {André L.F. {de Almeida} and Gérard Favier and João Cesar M. 
Mota}, - doi = {https://doi.org/10.1016/j.sigpro.2005.12.014}, - issn = {0165-1684}, + year = {2007}, + title = {PARAFAC-based unified tensor modeling for wireless communication systems with application to blind multiuser equalization}, journal = {Signal Processing}, - note = {Tensor Signal Processing}, + volume = {87}, number = {2}, pages = {337-351}, - title = {PARAFAC-based unified tensor modeling for wireless communication systems with application to blind multiuser equalization}, - url = {https://www.sciencedirect.com/science/article/pii/S0165168406001757}, - volume = {87}, - year = {2007} + issn = {0165-1684}, + doi = {10.1016/j.sigpro.2005.12.014}, + note = {Tensor Signal Processing} } -@article{DeesMandic2019, - author = {Bruno Scalzo Dees and Danilo P. Mandic}, - journal = {ArXiv}, - title = {A Statistically Identifiable Model for Tensor-Valued Gaussian Random Variables}, - url = {https://api.semanticscholar.org/CorpusID:207847615}, - volume = {abs/1911.02915}, - year = {2019} +@misc{DeesMandic2019, + author = {Bruno Scalzo Dees and Anh-Huy Phan and Danilo P. Mandic}, + year = {2019}, + title = {A Statistically Identifiable Model for Tensor-Valued Gaussian Random Variables}, + eprint = {1911.02915}, + archiveprefix = {arXiv}, + primaryclass = {eess.SP} } @article{DeLathauwerCastaing2007, author = {Lieven {De Lathauwer} and Joséphine Castaing}, - doi = {10.1016/j.sigpro.2005.12.015}, - issn = {0165-1684}, + year = {2007}, + title = {Tensor-based techniques for the blind separation of DS-CDMA signals}, journal = {Signal Processing}, - note = {Tensor Signal Processing}, + volume = {87}, number = {2}, pages = {322-336}, - title = {Tensor-based techniques for the blind separation of DS-CDMA signals}, - url = {https://www.sciencedirect.com/science/article/pii/S0165168406001745}, - volume = {87}, - year = {2007} + issn = {0165-1684}, + doi = {10.1016/j.sigpro.2005.12.015}, + note = {Tensor Signal Processing} } @article{deLeeuwMichailidis2000, author = {J. de Leeuw and G. Michailidis}, - journal = {Journal of Computational and Graphical Statistics}, - pages = {26-31}, + year = {2000}, title = {Discussion article on the paper by Lange, Hunter \& Yang (2000)}, + journal = {Journal of Computational and Graphical Statistics}, volume = {9}, - year = {2000} + pages = {26-31} } @article{delPino1989, author = {G. del Pino}, - journal = {Statistical Science}, - pages = {394-408}, + year = {1989}, title = {The unifying role of iterative generalized least squares in statistical algorithms}, + journal = {Statistical Science}, volume = {4}, - year = {1989} + pages = {394-408} } @article{DingCook2014, author = {Shanshan Ding and Cook, R. Dennis}, - doi = {10.5705/ss.2012.138}, + year = {2014}, + title = {Dimension folding PCA and PFC for matrix-valued predictors}, journal = {Statistica Sinica}, + volume = {24}, pages = {463--492}, publisher = {Institute of Statistical Science}, - title = {Dimension folding PCA and PFC for matrix-valued predictors}, - volume = {24}, - year = {2014} + doi = {10.5705/ss.2012.138} } @article{DingCook2015, author = {Shanshan Ding and R.
Dennis Cook}, - doi = {10.1016/j.jmva.2014.08.015}, - issn = {0047-259X}, - journal = {Journal of Multivariate Analysis}, - pages = {216-231}, + year = {2015}, title = {Tensor sliced inverse regression}, + journal = {Journal of Multivariate Analysis}, volume = {133}, - year = {2015} + pages = {216-231}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2014.08.015} } @article{DrtonEtAl2020, author = {Mathias Drton and Satoshi Kuriki and Peter D. Hoff}, - journal = {The Annals of Statistics}, + year = {2020}, title = {Existence and uniqueness of the Kronecker covariance MLE}, - url = {https://api.semanticscholar.org/CorpusID:212718000}, - year = {2020} + journal = {The Annals of Statistics}, + url = {https://api.semanticscholar.org/CorpusID:212718000} } @article{DrydenEtAl2009, author = {Ian L. Dryden and Alexey Koloydenko and Diwei Zhou}, - doi = {10.1214/09-AOAS249}, + year = {2009}, + title = {{Non-Euclidean statistics for covariance matrices, with applications to diffusion tensor imaging}}, journal = {The Annals of Applied Statistics}, + volume = {3}, number = {3}, pages = {1102 -- 1123}, publisher = {Institute of Mathematical Statistics}, - title = {{Non-Euclidean statistics for covariance matrices, with applications to diffusion tensor imaging}}, - volume = {3}, - year = {2009} + doi = {10.1214/09-AOAS249} } @misc{Dutilleul1990, - address = {Department of Mathematics. Universit\'e catholique de Louvian, Louvain-la-Neuve, Belgium}, author = {Pierre Dutilleul}, - note = {Unpublished D.Sc. Dissertation}, + year = {1990}, title = {Apport en analyse spectrale d'un p\'eriodogramme modifi\'e et mod\'elisation des s\'eries chronologiques avec r\'ep\'etitions en vue de leur comparaison en fr\'equence}, - year = {1990} + address = {Department of Mathematics. Universit\'e catholique de Louvain, Louvain-la-Neuve, Belgium}, + note = {Unpublished D.Sc. Dissertation} } @article{Dutilleul1999, author = {Pierre Dutilleul}, - doi = {10.1080/00949659908811970}, + year = {1999}, + title = {The mle algorithm for the matrix normal distribution}, journal = {Journal of Statistical Computation and Simulation}, + volume = {64}, number = {2}, pages = {105-123}, publisher = {Taylor & Francis}, - title = {The mle algorithm for the matrix normal distribution}, - volume = {64}, - year = {1999} + doi = {10.1080/00949659908811970} } @book{Eaton2007, author = {Morris L. Eaton}, - publisher = {Institute of Mathematical Statistics}, - series = {Lecture Notes--Monograph Series, Volume 53}, + year = {2007}, title = {Multivariate Statistics: A Vector Space Approach}, + publisher = {Institute of Mathematical Statistics}, url = {https://projecteuclid.org/euclid.lnms/1196285102}, - year = {2007} + series = {Lecture Notes--Monograph Series, Volume 53} } @article{EdelmanEtAl1998, author = {Edelman, A. and Arias, T.
and Smith, S.}, - doi = {10.1137/S0895479895290954}, - eprint = {https://doi.org/10.1137/S0895479895290954}, + year = {1998}, + title = {The Geometry of Algorithms with Orthogonality Constraints}, journal = {SIAM Journal on Matrix Analysis and Applications}, + volume = {20}, number = {2}, pages = {303-353}, - title = {The Geometry of Algorithms with Orthogonality Constraints}, - url = {https://doi.org/10.1137/S0895479895290954}, - volume = {20}, - year = {1998} + doi = {10.1137/S0895479895290954} } @article{Einstein1916, author = {Einstein, Albert}, - doi = {10.1002/andp.19163540702}, + year = {1916}, + title = {Die Grundlage der allgemeinen Relativitätstheorie}, journal = {Annalen der Physik}, + volume = {354}, number = {7}, pages = {769-822}, - title = {Die Grundlage der allgemeinen Relativitätstheorie}, - volume = {354}, - year = {1916} + doi = {10.1002/andp.19163540702} } @article{Fan1993, author = {Jianqing Fan}, - journal = {Annals of Statistics}, - pages = {196-216}, + year = {1993}, title = {Local Linear Regression Smoothers and Their Minimax Efficiencies}, - url = {https://api.semanticscholar.org/CorpusID:9375835}, + journal = {Annals of Statistics}, volume = {21}, - year = {1993} + pages = {196-216}, + url = {https://api.semanticscholar.org/CorpusID:9375835} } @article{FanGijbels1992, author = {Jianqing Fan and Irene Gijbels}, - issn = {00905364}, + year = {1992}, + title = {Variable Bandwidth and Local Linear Regression Smoothers}, journal = {The Annals of Statistics}, + volume = {20}, number = {4}, pages = {2008--2036}, publisher = {Institute of Mathematical Statistics}, - title = {Variable Bandwidth and Local Linear Regression Smoothers}, + issn = {00905364}, url = {http://www.jstor.org/stable/2242378}, - urldate = {2024-01-25}, - volume = {20}, - year = {1992} + urldate = {2024-01-25} } @article{FanLi2001, author = {Jianqing Fan and Runze Li}, - doi = {10.1198/016214501753382273}, - eprint = {https://doi.org/10.1198/016214501753382273}, + year = {2001}, + title = {Variable Selection via Nonconcave Penalized Likelihood and its Oracle Properties}, journal = {Journal of the American Statistical Association}, + volume = {96}, number = {456}, pages = {1348-1360}, publisher = {Taylor & Francis}, - title = {Variable Selection via Nonconcave Penalized Likelihood and its Oracle Properties}, - url = {https://doi.org/10.1198/016214501753382273}, - volume = {96}, - year = {2001} + doi = {10.1198/016214501753382273} } @article{FertlBura2022a, author = {Fertl, Lukas and Bura, Efstathia}, - doi = {10.3150/21-bej1402}, - fjournal = {Bernoulli. Official Journal of the Bernoulli Society for Mathematical Statistics and Probability}, - issn = {1350-7265,1573-9759}, + year = {2022}, + title = {Conditional variance estimator for sufficient dimension reduction}, journal = {Bernoulli}, - mrclass = {62G08}, - mrnumber = {4411514}, + volume = {28}, number = {3}, pages = {1862--1891}, - title = {Conditional variance estimator for sufficient dimension reduction}, - url = {https://doi.org/10.3150/21-bej1402}, - volume = {28}, - year = {2022} + issn = {1350-7265,1573-9759}, + doi = {10.3150/21-bej1402}, + fjournal = {Bernoulli. 
Official Journal of the Bernoulli Society for Mathematical Statistics and Probability}, + mrclass = {62G08}, + mrnumber = {4411514} } @article{FertlBura2022b, author = {Fertl, Lukas and Bura, Efstathia}, - doi = {10.1214/22-EJS1994}, - fjournal = {Electronic Journal of Statistics}, - issn = {1935-7524}, + year = {2022}, + title = {The ensemble conditional variance estimator for sufficient dimension reduction}, journal = {Electron. J. Stat.}, - mrclass = {62G07 (62G08)}, - mrnumber = {4390504}, + volume = {16}, number = {1}, pages = {1595--1634}, - title = {The ensemble conditional variance estimator for sufficient dimension reduction}, - url = {https://doi.org/10.1214/22-EJS1994}, - volume = {16}, - year = {2022} + issn = {1935-7524}, + doi = {10.1214/22-EJS1994}, + fjournal = {Electronic Journal of Statistics}, + mrclass = {62G07 (62G08)}, + mrnumber = {4390504} } @inproceedings{FischerIgel2012, - address = {Berlin, Heidelberg}, author = {Fischer, Asja and Igel, Christian}, - booktitle = {Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications}, - editor = {Alvarez, Luis and Mejail, Marta and Gomez, Luis and Jacobo, Julio}, - isbn = {978-3-642-33275-3}, + year = {2012}, + title = {An Introduction to Restricted Boltzmann Machines}, pages = {14--36}, publisher = {Springer Berlin Heidelberg}, - title = {An Introduction to Restricted Boltzmann Machines}, - year = {2012} + address = {Berlin, Heidelberg}, + booktitle = {Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications}, + editor = {Alvarez, Luis and Mejail, Marta and Gomez, Luis and Jacobo, Julio}, + isbn = {978-3-642-33275-3} } @article{Fisher1922, author = {R. A. Fisher}, - issn = {02643952}, + year = {1922}, + title = {On the Mathematical Foundations of Theoretical Statistics}, journal = {Philosophical Transactions of the Royal Society of London. Series A, Containing Papers of a Mathematical or Physical Character}, + volume = {222}, number = {}, pages = {309--368}, publisher = {The Royal Society}, - title = {On the Mathematical Foundations of Theoretical Statistics}, + issn = {02643952}, url = {http://www.jstor.org/stable/91208}, - urldate = {2024-01-22}, - volume = {222}, - year = {1922} + urldate = {2024-01-22} } -@misc{friedbergEtAl2020, - author = {Friedberg, Rina and Tibshirani, Julie and Athey, Susan and Wager, Stefan}, - howpublished = {arXiv:1807.11408 [cs, econ, math, stat]}, - note = {\url{http://arxiv.org/abs/1807.11408}}, - title = {Local {L}inear {F}orests}, - urldate = {2021-03-08}, - year = {2020} +@misc{FriedbergEtAl2020, + author = {Rina Friedberg and Julie Tibshirani and Susan Athey and Stefan Wager}, + year = {2020}, + title = {{Local Linear Forests}}, + eprint = {1807.11408}, + archiveprefix = {arXiv}, + primaryclass = {stat.ML} } @article{Friedman1991, author = {Jerome H. Friedman}, - issn = {00905364}, + year = {1991}, + title = {Multivariate Adaptive Regression Splines}, journal = {The Annals of Statistics}, + volume = {19}, number = {1}, pages = {1--67}, publisher = {Institute of Mathematical Statistics}, - title = {Multivariate Adaptive Regression Splines}, - url = {http://www.jstor.org/stable/2241837}, - volume = {19}, - year = {1991} + issn = {00905364}, + url = {http://www.jstor.org/stable/2241837} } @article{FukumizuEtAl2009, author = {Fukumizu, Kenji and Bach, Francis R. and Jordan, Michael I.}, - doi = {10.1214/08-AOS637}, - fjournal = {The Annals of Statistics}, + year = {2009}, + title = {Kernel dimension reduction in regression}, journal = {Ann. 
Statist.}, - month = {08}, + volume = {37}, number = {4}, pages = {1871--1905}, publisher = {The Institute of Mathematical Statistics}, - title = {Kernel dimension reduction in regression}, - volume = {37}, - year = {2009} + doi = {10.1214/08-AOS637}, + fjournal = {The Annals of Statistics}, + month = {08} } @misc{GhojoghEtAl2021, - archiveprefix = {arXiv}, author = {Benyamin Ghojogh and Ali Ghodsi and Fakhri Karray and Mark Crowley}, - doi = {10.48550/arXiv.2110.09620}, - eprint = {2110.09620}, - primaryclass = {stat.ME}, + year = {2021}, title = {Sufficient Dimension Reduction for High-Dimensional Regression and Low-Dimensional Embedding: Tutorial and Survey}, - year = {2021} + eprint = {2110.09620}, + archiveprefix = {arXiv}, + primaryclass = {stat.ME} } @article{GirkaEtAl2024, author = {Fabien Girka and Arnaud Gloaguen and Laurent {Le Brusquet} and Violetta Zujovic and Arthur Tenenhaus}, - doi = {10.1016/j.inffus.2023.102045}, - issn = {1566-2535}, - journal = {Information Fusion}, + year = {2024}, title = {Tensor generalized canonical correlation analysis}, + journal = {Information Fusion}, volume = {102}, - year = {2024} + issn = {1566-2535}, + doi = {10.1016/j.inffus.2023.102045} } @article{GlobersonTishby2003, author = {Amir Globerson and Naftali Tishby}, - journal = {Journal of Machine Learning Research}, + year = {2003}, title = {Sufficient Dimensionality Reduction}, - url = {https://api.semanticscholar.org/CorpusID:5095858}, + journal = {Journal of Machine Learning Research}, volume = {3}, - year = {2003} + url = {https://api.semanticscholar.org/CorpusID:5095858} } @book{GolubVanLoanl996, author = {Golub, Gene H. and Van Loan, Charles F.}, - edition = {Third}, - publisher = {The Johns Hopkins University Press}, + year = {1996}, title = {Matrix Computations}, - year = {1996} + publisher = {The Johns Hopkins University Press}, + edition = {Third} } @book{GoodfellowEtAl2016, author = {Ian Goodfellow and Yoshua Bengio and Aaron Courville}, - publisher = {MIT Press}, + year = {2016}, title = {Deep Learning}, - url = {\url{http://www.deeplearningbook.org}}, - year = {2016} + publisher = {MIT Press}, + url = {http://www.deeplearningbook.org} } @article{Green1984, author = {P. J. Green}, - issn = {00359246}, + year = {1984}, + title = {Iteratively Reweighted Least Squares for Maximum Likelihood Estimation, and some Robust and Resistant Alternatives}, journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + volume = {46}, number = {2}, pages = {149--192}, publisher = {[Royal Statistical Society, Wiley]}, - title = {Iteratively Reweighted Least Squares for Maximum Likelihood Estimation, and some Robust and Resistant Alternatives}, - url = {http://www.jstor.org/stable/2345503}, - volume = {46}, - year = {1984} + issn = {00359246}, + url = {http://www.jstor.org/stable/2345503} } @article{GreenewaldHero2014, author = {Kristjan H. Greenewald and Alfred O.
Hero}, - journal = {IEEE Transactions on Signal Processing}, - pages = {6368-6378}, + year = {2014}, title = {Robust Kronecker Product PCA for Spatio-Temporal Covariance Estimation}, - url = {https://api.semanticscholar.org/CorpusID:15582097}, + journal = {IEEE Transactions on Signal Processing}, volume = {63}, - year = {2014} + pages = {6368-6378}, + url = {https://api.semanticscholar.org/CorpusID:15582097} } @book{Gurney1997, - address = {USA}, author = {Gurney, Kevin}, - isbn = {1857286731}, - publisher = {Taylor \& Francis, Inc.}, + year = {1997}, title = {An Introduction to Neural Networks}, - year = {1997} + publisher = {Taylor \& Francis, Inc.}, + address = {USA}, + isbn = {1857286731} } @article{Habeck2014, author = {Habeck, Michael}, - doi = {10.1103/PhysRevE.89.052113}, - issue = {5}, + year = {2014}, + title = {Bayesian approach to inverse statistical mechanics}, journal = {Phys. Rev. E}, - month = {May}, - numpages = {7}, + volume = {89}, pages = {052113}, publisher = {American Physical Society}, - title = {Bayesian approach to inverse statistical mechanics}, - volume = {89}, - year = {2014} + doi = {10.1103/PhysRevE.89.052113}, + issue = {5}, + month = {May}, + numpages = {7} } @misc{HajriEtAl2017, author = {Hajri, Hatem and Said, Salem and Berthoumieu, Yannick}, - doi = {10.1007/978-3-319-68445-1_80}, + year = {2017}, + title = {Maximum Likelihood Estimators on Manifolds}, journal = {Lecture Notes in Computer Science}, pages = {692-700}, publisher = {Springer International Publishing}, - title = {Maximum Likelihood Estimators on Manifolds}, - year = {2017} + doi = {10.1007/978-3-319-68445-1_80} } @article{HallLi1993, author = {Hall, P. and Li, KC.}, + year = {1993}, + title = {On almost Linearity of Low Dimensional Projections from High Dimensional Data}, journal = {Annals of Statistics}, - language = {English}, + volume = {21}, number = {2}, pages = { 867-889}, - title = {On almost Linearity of Low Dimensional Projections from High Dimensional Data}, - volume = {21}, - year = {1993} + language = {English} } @article{HaoEtAl2021, author = {Botao Hao and Boxiang Wang and Pengyuan Wang and Jingfei Zhang and Jian Yang and Will Wei Sun}, + year = {2021}, + title = {Sparse Tensor Additive Regression}, journal = {Journal of Machine Learning Research}, + volume = {22}, number = {64}, pages = {1--43}, - title = {Sparse Tensor Additive Regression}, - url = {http://jmlr.org/papers/v22/19-769.html}, - volume = {22}, - year = {2021} + url = {http://jmlr.org/papers/v22/19-769.html} } @book{Harville1997, - address = {New York}, author = {David A. Harville}, - chapter = {15}, - edition = {1}, - publisher = {Springer-Verlag}, + year = {1997}, title = {Matrix Algebra From a Statistician's Perspective}, - year = {1997} + publisher = {Springer-Verlag}, + address = {New York}, + chapter = {15}, + edition = {1} } @book{HastieTibshirani1990, author = {Hastie, Trevor John and Tibshirani, Robert J.}, + year = {1990}, + title = {Generalized additive models}, + volume = {43}, + pages = {xvi+335}, + publisher = {Chapman and Hall, Ltd., London}, isbn = {0-412-34390-8}, mrclass = {62J02 (62-07 62G05 62J20)}, mrnumber = {1082147}, - pages = {xvi+335}, - publisher = {Chapman and Hall, Ltd., London}, - series = {Monographs on Statistics and Applied Probability}, - title = {Generalized additive models}, - volume = {43}, - year = {1990} + series = {Monographs on Statistics and Applied Probability} } @article{HillarLim2013, + author = {Hillar, Christopher J. 
and Lim, Lek\-Heng}, + year = {2013}, + title = {Most Tensor Problems Are NP-Hard}, + journal = {J. ACM}, + volume = {60}, + number = {6}, + publisher = {Association for Computing Machinery}, + issn = {0004-5411}, + doi = {10.1145/2512329}, address = {New York, NY, USA}, articleno = {45}, - author = {Hillar, Christopher J. and Lim, Lek\-Heng}, - doi = {10.1145/2512329}, - issn = {0004-5411}, issue_date = {November 2013}, - journal = {J. ACM}, - number = {6}, - numpages = {39}, - publisher = {Association for Computing Machinery}, - title = {Most Tensor Problems Are NP-Hard}, - url = {https://doi.org/10.1145/2512329}, - volume = {60}, - year = {2013} + numpages = {39} } @article{Hinton2002, author = {Hinton, Geoffrey E.}, - doi = {10.1162/089976602760128018}, - issn = {0899-7667}, + year = {2002}, + title = {{Training Products of Experts by Minimizing Contrastive Divergence}}, journal = {Neural Computation}, - month = {08}, + volume = {14}, number = {8}, pages = {1771--1800}, - title = {{Training Products of Experts by Minimizing Contrastive Divergence}}, - volume = {14}, - year = {2002} + issn = {0899-7667}, + doi = {10.1162/089976602760128018}, + month = {08} } @misc{Hinton2012, author = {Hinton, Geoffrey E.}, - note = {Coursera Lecture 6 - Online; accessed Jan 18, 2024}, + year = {2012}, title = {{Neural Networks for Machine Learning}}, url = {www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf}, - year = {2012} + note = {Coursera Lecture 6 - Online; accessed Jan 18, 2024} } @article{Hoff2011, author = {Peter D. Hoff}, - doi = {10.1214/11-BA606}, + year = {2011}, + title = {{Separable covariance arrays via the Tucker product, with applications to multivariate relational data}}, journal = {Bayesian Analysis}, - keywords = {Gaussian, matrix normal, multiway data, network, tensor, Tucker decomposition}, + volume = {6}, number = {2}, pages = {179 -- 196}, publisher = {International Society for Bayesian Analysis}, - title = {{Separable covariance arrays via the Tucker product, with applications to multivariate relational data}}, - volume = {6}, - year = {2011} + doi = {10.1214/11-BA606}, + keywords = {Gaussian, matrix normal, multiway data, network, tensor, Tucker decomposition} } @article{Hoff2015, author = {Peter D. 
Hoff}, - doi = {10.1214/15-AOAS839}, + year = {2015}, + title = {{Multilinear tensor regression for longitudinal relational data}}, journal = {The Annals of Applied Statistics}, - keywords = {Array normal, Bayesian inference, event data, international relations, network, Tucker product, vector autoregression}, + volume = {9}, number = {3}, pages = {1169 -- 1193}, publisher = {Institute of Mathematical Statistics}, - title = {{Multilinear tensor regression for longitudinal relational data}}, - volume = {9}, - year = {2015} + doi = {10.1214/15-AOAS839}, + keywords = {Array normal, Bayesian inference, event data, international relations, network, Tucker product, vector autoregression} } @article{Hornik1991, author = {Hornik, Kurt}, - issn = {0893-6080}, + year = {1991}, + title = {Approximation capabilities of multilayer feedforward networks}, journal = {Neural Networks}, - note = {\url{https://doi.org/10.1016/0893-6080(91)90009-T}}, + volume = {4}, number = {2}, pages = {251-257}, - title = {Approximation capabilities of multilayer feedforward networks}, - volume = {4}, - year = {1991} + issn = {0893-6080}, + note = {\url{https://doi.org/10.1016/0893-6080(91)90009-T}} } @article{HuLeeWang2022, author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, - doi = {10.1080/10618600.2021.1978471}, + year = {2022}, + title = {Generalized Tensor Decomposition With Features on Multiple Modes}, journal = {Journal of Computational and Graphical Statistics}, + volume = {31}, number = {1}, pages = {204-218}, publisher = {Taylor \& Francis}, - title = {Generalized Tensor Decomposition With Features on Multiple Modes}, - volume = {31}, - year = {2022} + doi = {10.1080/10618600.2021.1978471} } @article{Ising1925, author = {Ising, Ernst}, - doi = {10.1007/BF02980577}, - issn = {0044-3328}, + year = {1925}, + title = {{Beitrag zur Theorie des Ferromagnetismus}}, journal = {Zeitschrift f\"ur Physik}, - month = {2}, + volume = {31}, number = {1}, pages = {253-258}, - title = {{Beitrag zur Theorie des Ferromagnetismus}}, - volume = {31}, - year = {1925} + issn = {0044-3328}, + doi = {10.1007/BF02980577}, + month = {2} } @book{JamesEtAl2021, author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert}, + year = {2021}, + title = {An introduction to statistical learning---with applications in {R}}, + pages = {xv+607}, + publisher = {Springer, New York}, doi = {10.1007/978-1-0716-1418-1}, edition = {Second}, isbn = {978-1-0716-1418-1}, mrclass = {62-01 (62-04 62H30 62Jxx 62M45 62N01)}, mrnumber = {4309209}, - pages = {xv+607}, - publisher = {Springer, New York}, - series = {Springer Texts in Statistics}, - title = {An introduction to statistical learning---with applications in {R}}, - url = {https://doi.org/10.1007/978-1-0716-1418-1}, - year = {2021} + series = {Springer Texts in Statistics} } @incollection{JennyHaselmayerKapla2021, - address = {London}, author = {Jenny, Marcelo and Haselmayer, Martin and Kapla, Daniel}, + year = {2021}, + title = {Measuring incivility in parliamentary debates : validating a sentiment analysis procedure with calls to order in the Austrian Parliament}, + pages = {1--11}, + publisher = {Routledge}, + address = {London}, booktitle = {Political Incivility in the Parliamentary, Electoral and Media Arena : Crossing Boundaries}, editor = {Walter, Annemarie S.}, isbn = {978-0-367-46273-4}, - pages = {1--11}, - publisher = {Routledge}, - series = {Routledge studies on political parties and party systems}, - title = {Measuring incivility in parliamentary debates : 
validating a sentiment analysis procedure with calls to order in the Austrian Parliament}, - year = {2021} + series = {Routledge studies on political parties and party systems} } @book{JohnsonEtAl1997, author = {Johnson, Norman L. and Kotz, Samuel and Balakrishnan, N.}, + year = {1997}, + title = {{Discrete Multivariate Distributions}}, + pages = {xxii+299}, + publisher = {John Wiley \& Sons, Inc., New York}, isbn = {0-471-12844-9}, mrclass = {62E15 (60C05 60E05 62H05)}, mrnumber = {1429617}, note = {A Wiley-Interscience Publication}, - pages = {xxii+299}, - publisher = {John Wiley \& Sons, Inc., New York}, - series = {Wiley Series in Probability and Statistics: Applied Probability and Statistics}, - title = {{Discrete Multivariate Distributions}}, - year = {1997} + series = {Wiley Series in Probability and Statistics: Applied Probability and Statistics} } @article{Jolliffe1982, author = {Ian T. Jolliffe}, - issn = {00359254, 14679876}, + year = {1982}, + title = {A Note on the Use of Principal Components in Regression}, journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)}, + volume = {31}, number = {3}, pages = {300--303}, publisher = {[Wiley, Royal Statistical Society]}, - title = {A Note on the Use of Principal Components in Regression}, - url = {http://www.jstor.org/stable/2348005}, - volume = {31}, - year = {1982} + issn = {00359254, 14679876}, + url = {http://www.jstor.org/stable/2348005} } @article{JungEtAl2019, author = {Sungkyu Jung and Jeongyoun Ahn and Yongho Jeon}, - doi = {10.1080/10618600.2019.1568014}, + year = {2019}, + title = {Penalized Orthogonal Iteration for Sparse Estimation of Generalized Eigenvalue Problem}, journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, number = {3}, pages = {710-721}, publisher = {Taylor & Francis}, - title = {Penalized Orthogonal Iteration for Sparse Estimation of Generalized Eigenvalue Problem}, - volume = {28}, - year = {2019} + doi = {10.1080/10618600.2019.1568014} } @book{Kaltenbaeck2021, author = {Kaltenb\"ack, Michael}, + year = {2021}, + title = {Aufbau Analysis}, + publisher = {Heldermann Verlag}, edition = {27}, isbn = {978-3-88538-127-3}, - publisher = {Heldermann Verlag}, - series = {Berliner Studienreihe zur Mathematik}, - title = {Aufbau Analysis}, - year = {2021} + series = {Berliner Studienreihe zur Mathematik} } @article{Kapla2019, author = {Kapla, Daniel}, - title = {Comparison of Different Word Embeddings and Neural Network Types for Sentiment Analysis of German Political Speeches}, - year = {2019} + year = {2019}, + title = {Comparison of Different Word Embeddings and Neural Network Types for Sentiment Analysis of German Political Speeches} } @article{KaplaFertlBura2022, author = {Kapla, Daniel and Fertl, Lukas and Bura, Efstathia}, + year = {2022}, + title = {Fusing sufficient dimension reduction with neural networks}, + journal = {Comput. Statist. Data Anal.}, + volume = {168}, + pages = {Paper No. 107390, 20}, + issn = {0167-9473,1872-7352}, doi = {10.1016/j.csda.2021.107390}, fjournal = {Computational Statistics \& Data Analysis}, - issn = {0167-9473,1872-7352}, - journal = {Comput. Statist. Data Anal.}, mrclass = {99-01}, - mrnumber = {4343643}, - pages = {Paper No. 107390, 20}, - title = {Fusing sufficient dimension reduction with neural networks}, - url = {https://doi.org/10.1016/j.csda.2021.107390}, - volume = {168}, - year = {2022} + mrnumber = {4343643} } -@misc{KingmaWelling2019, - author = {Kingma, Diederik P. 
and Welling, Max}, - howpublished = {arXiv:1906.02691 [cs.LG]}, - note = {\url{http://arxiv.org/abs/1906.02691}}, - title = {An {I}ntroduction to {V}ariational {A}utoencoders}, - year = 2019 +@article{KingmaWelling2019, + author = {Kingma, Diederik P. and Welling, Max}, + year = {2019}, + title = {An Introduction to Variational Autoencoders}, + journal = {Foundations and Trends® in Machine Learning}, + volume = {12}, + number = {4}, + pages = {307--392}, + publisher = {Now Publishers}, + issn = {1935-8245}, + doi = {10.1561/2200000056} } @inproceedings{KofidisRegalia2005, author = {Eleftherios Kofidis and Phillip A. Regalia}, + year = {2005}, title = {Tensor Approximation and Signal Processing Applications}, - url = {https://api.semanticscholar.org/CorpusID:13667742}, - year = {2005} + url = {https://api.semanticscholar.org/CorpusID:13667742} } @article{Kolda2006, author = {Kolda, Tamara Gibson}, + year = {2006}, + title = {Multilinear operators for higher-order decompositions}, doi = {10.2172/923081}, month = {4}, place = {United States}, - title = {Multilinear operators for higher-order decompositions.}, - type = {Technical Report}, - url = {https://www.osti.gov/biblio/923081}, - year = {2006} + type = {Technical Report} } @article{KoldaBader2009, author = {Kolda, Tamara G. and Bader, Brett W.}, - doi = {10.1137/07070111X}, + year = {2009}, + title = {Tensor Decompositions and Applications}, journal = {SIAM Review}, + volume = {51}, number = {3}, pages = {455-500}, - title = {Tensor Decompositions and Applications}, - volume = {51}, - year = {2009} + doi = {10.1137/07070111X} } @book{KolloVonRosen2005, author = {Kollo, T\~onu and von Rosen, Dietrich}, + year = {2005}, + title = {Advanced Multivariate Statistics with Matrices}, + publisher = {Springer Dordrecht}, doi = {10.1007/1-4020-3419-9}, editor = {Hazewinkel, M.}, - isbn = {978-1-4020-3419-0}, - publisher = {Springer Dordrecht}, - title = {Advanced Multivariate Statistics with Matrices}, - year = {2005} + isbn = {978-1-4020-3419-0} } @inproceedings{KongEtAl2005, author = {Hui Kong and Xuchun Li and Lei Wang and Earn Khwang Teoh and Jian-Gang Wang and R. Venkateswarlu}, - booktitle = {Proceedings. 2005 IEEE International Joint Conference on Neural Networks, 2005.}, - doi = {10.1109/IJCNN.2005.1555814}, - issn = {2161-4393}, - number = {}, - pages = {108-113}, + year = {2005}, title = {Generalized 2D principal component analysis}, volume = {1}, - year = {2005} + number = {}, + pages = {108-113}, + issn = {2161-4393}, + doi = {10.1109/IJCNN.2005.1555814}, + booktitle = {Proceedings. 
2005 IEEE International Joint Conference on Neural Networks, 2005.} } @article{Kramer1991, author = {Kramer, Mark A.}, + year = {1991}, + title = {Nonlinear principal component analysis using autoassociative neural networks}, journal = {AIChE Journal}, - note = {\url{https://doi.org/10.1002/aic.690370209}}, + volume = {37}, number = {2}, pages = {233-243}, - title = {Nonlinear principal component analysis using autoassociative neural networks}, - volume = {37}, - year = {1991} + doi = {10.1002/aic.690370209} } @book{Kroonenberg2008, - address = {New York}, author = {Kroonenberg, Pieter M.}, - doi = {10.1002/9780470238004}, - isbn = {9780470238004}, - publisher = {John Wiley \& Sons, Ltd}, + year = {2008}, title = {Applied Multiway Data Analysis}, - year = {2008} + publisher = {John Wiley \& Sons, Ltd}, + doi = {10.1002/9780470238004}, + address = {New York}, + isbn = {9780470238004} } @book{Kusolitsch2011, author = {Kusolitsch, Norbert}, + year = {2011}, + title = {{M}a\ss{}- und {W}ahrscheinlichkeitstheorie}, + publisher = {Springer Vienna}, doi = {10.1007/978-3-7091-0685-3}, isbn = {978-3-7091-0684-6}, - publisher = {Springer Vienna}, series = {Springer-Lehrbuch}, - subtitle = {{E}ine {E}inf{\"u}hrung}, - title = {{M}a\ss{}- und {W}ahrscheinlichkeitstheorie}, - year = {2011} + subtitle = {{E}ine {E}inf{\"u}hrung} } @article{LandgrafLee2020, author = {Andrew J. Landgraf and Yoonkyung Lee}, - doi = {10.1016/j.jmva.2020.104668}, - issn = {0047-259X}, - journal = {Journal of Multivariate Analysis}, - pages = {104668}, + year = {2020}, title = {Dimensionality reduction for binary data through the projection of natural parameters}, + journal = {Journal of Multivariate Analysis}, volume = {180}, - year = {2020} + pages = {104668}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2020.104668} } @book{Lauritzen1996, author = {Lauritzen, Steffen L}, + year = {1996}, + title = {{Graphical Models}}, + publisher = {Oxford University Press}, doi = {10.1093/oso/9780198522195.001.0001}, isbn = {9780198522195}, - month = {05}, - publisher = {Oxford University Press}, - title = {{Graphical Models}}, - year = {1996} + month = {05} } @article{LauritzenRichardson2002, author = {Steffen L. Lauritzen and Thomas S. Richardson}, - issn = {13697412, 14679868}, + year = {2002}, + title = {Chain Graph Models and Their Causal Interpretations}, journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)}, + volume = {64}, number = {3}, pages = {321--361}, publisher = {[Royal Statistical Society, Wiley]}, - title = {Chain Graph Models and Their Causal Interpretations}, + issn = {13697412, 14679868}, url = {http://www.jstor.org/stable/3088778}, - urldate = {2024-01-20}, - volume = {64}, - year = {2002} + urldate = {2024-01-20} } @article{LeBihanEtAl2001, author = {Le Bihan, Denis and Mangin, Jean-Fran\c{c}ois and Poupon, Cyril and Clark, Chris A. 
and Pappata, Sabina and Molko, Nicolas and Chabriat, Hughes}, - doi = {https://doi.org/10.1002/jmri.1076}, + year = {2001}, + title = {Diffusion tensor imaging: Concepts and applications}, journal = {Journal of Magnetic Resonance Imaging}, + volume = {13}, number = {4}, pages = {534-546}, - title = {Diffusion tensor imaging: Concepts and applications}, - url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/jmri.1076}, - volume = {13}, - year = {2001} + doi = {https://doi.org/10.1002/jmri.1076} } @book{Lee2012, author = {Lee, John M.}, - doi = {10.1007/978-1-4419-9982-5}, + year = {2012}, + title = {Introduction to Smooth Manifolds}, journal = {Graduate Texts in Mathematics}, publisher = {Springer New York}, - title = {Introduction to Smooth Manifolds}, - year = {2012} + doi = {10.1007/978-1-4419-9982-5} } @book{Lee2018, author = {Lee, John M.}, - doi = {10.1007/978-3-319-91755-9}, + year = {2018}, + title = {Introduction to Riemannian Manifolds}, journal = {Graduate Texts in Mathematics}, publisher = {Springer International Publishing}, - title = {Introduction to Riemannian Manifolds}, - year = {2018} + doi = {10.1007/978-3-319-91755-9} } @article{LengPan2018, author = {Leng, Chenlei and Pan, Guangming}, - doi = {10.3150/17-BEJ980}, + year = {2018}, + title = {{Covariance estimation via sparse Kronecker structures}}, journal = {Bernoulli}, + volume = {24}, number = {4B}, pages = {3833 -- 3863}, publisher = {Bernoulli Society for Mathematical Statistics and Probability}, - title = {{Covariance estimation via sparse Kronecker structures}}, - volume = {24}, - year = {2018} + doi = {10.3150/17-BEJ980} } @article{Lenz1920, author = {W. Lenz}, - journal = {European Physical Journal A}, - pages = {613--615}, + year = {1920}, title = {Beitrag zum Verst{\"a}ndnis der magnetischen Erscheinungen in festen K{\"o}rpern}, - url = {https://cds.cern.ch/record/460663}, + journal = {European Physical Journal A}, volume = {21}, - year = {1920} + pages = {613--615}, + url = {https://cds.cern.ch/record/460663} } @article{LeporeEtAl2008, author = {Lepore, Natasha and Brun, Caroline and Chou, Yi-Yu and Chiang, Ming-Chang and Dutton, Rebecca A. and Hayashi, Kiralee M. and Luders, Eileen and Lopez, Oscar L. and Aizenstein, Howard J. and Toga, Arthur W. and Becker, James T. and Thompson, Paul M.}, - doi = {10.1109/TMI.2007.906091}, + year = {2008}, + title = {Generalized Tensor-Based Morphometry of HIV/AIDS Using Multivariate Statistics on Deformation Tensors}, journal = {IEEE Transactions on Medical Imaging}, + volume = {27}, number = {1}, pages = {129-141}, - title = {Generalized Tensor-Based Morphometry of HIV/AIDS Using Multivariate Statistics on Deformation Tensors}, - volume = {27}, - year = {2008} + doi = {10.1109/TMI.2007.906091} } @article{LeurgansRoss1992, author = {Sue Leurgans and Robert T. Ross}, - doi = {10.1214/ss/1177011225}, + year = {1992}, + title = {{Multilinear Models: Applications in Spectroscopy}}, journal = {Statistical Science}, - keywords = {Multi-mode factor analysis, nonlinear least-squares, PARAFAC, three-way arrays}, + volume = {7}, number = {3}, pages = {289 -- 310}, publisher = {Institute of Mathematical Statistics}, - title = {{Multilinear Models: Applications in Spectroscopy}}, - volume = {7}, - year = {1992} + doi = {10.1214/ss/1177011225}, + keywords = {Multi-mode factor analysis, nonlinear least-squares, PARAFAC, three-way arrays} } @article{LezonEtAl2006, author = {Timothy R. Lezon and Jayanth R. Banavar and Marek Cieplak and Amos Maritan and Nina V. 
Fedoroff}, - doi = {10.1073/pnas.0609152103}, + year = {2006}, + title = {Using the principle of entropy maximization to infer genetic interaction networks from gene expression patterns}, journal = {Proceedings of the National Academy of Sciences}, + volume = {103}, number = {50}, pages = {19033-19038}, - title = {Using the principle of entropy maximization to infer genetic interaction networks from gene expression patterns}, - volume = {103}, - year = {2006} + doi = {10.1073/pnas.0609152103} } @article{Li1991, author = {Li, Ker-Chau}, - doi = {10.1080/01621459.1991.10475035}, - fjournal = {Journal of the American Statistical Association}, + year = {1991}, + title = {{Sliced Inverse Regression for Dimension Reduction}}, journal = {J. Amer. Statist. Assoc.}, + volume = {86}, number = {414}, pages = {316--327}, - title = {{Sliced Inverse Regression for Dimension Reduction}}, - volume = {86}, - year = {1991} + doi = {10.1080/01621459.1991.10475035}, + fjournal = {Journal of the American Statistical Association} } @article{Li1992, author = {Li, Ker-Chau}, - doi = {10.1080/01621459.1992.10476258}, - fjournal = {Journal of the American Statistical Association}, - issn = {0162-1459,1537-274X}, + year = {1992}, + title = {On principal {H}essian directions for data visualization and dimension reduction: another application of {S}tein's lemma}, journal = {J. Amer. Statist. Assoc.}, + volume = {87}, number = {420}, pages = {1025--1039}, publisher = {Taylor \& Francis}, - title = {On principal {H}essian directions for data visualization and dimension reduction: another application of {S}tein's lemma}, - volume = {87}, - year = {1992} + issn = {0162-1459,1537-274X}, + doi = {10.1080/01621459.1992.10476258}, + fjournal = {Journal of the American Statistical Association} } @book{Li2018, author = {Li, Bing}, + year = {2018}, + title = {Sufficient dimension reduction}, + volume = {161}, + pages = {xxi+283}, + publisher = {CRC Press, Boca Raton, FL}, doi = {10.1201/9781315119427}, isbn = {978-1-4987-0447-2}, mrclass = {62-02 (62G08 62H12 62H20 62L10)}, mrnumber = {3838449}, note = {Methods and applications with R}, - pages = {xxi+283}, - publisher = {CRC Press, Boca Raton, FL}, - series = {Monographs on Statistics and Applied Probability}, - title = {Sufficient dimension reduction}, - url = {https://doi.org/10.1201/9781315119427}, - volume = {161}, - year = {2018} + series = {Monographs on Statistics and Applied Probability} } @article{LiArtemiouLi2011, author = {Li, Bing and Artemiou, Andreas and Li, Lexin}, - doi = {10.1214/11-AOS932}, - fjournal = {The Annals of Statistics}, + year = {2011}, + title = {Principal support vector machines for linear and nonlinear sufficient dimension reduction}, journal = {Ann. 
Statist.}, - month = {12}, + volume = {39}, number = {6}, pages = {3182--3210}, publisher = {The Institute of Mathematical Statistics}, - title = {Principal support vector machines for linear and nonlinear sufficient dimension reduction}, - url = {https://doi.org/10.1214/11-AOS932}, - volume = {39}, - year = {2011} + doi = {10.1214/11-AOS932}, + fjournal = {The Annals of Statistics}, + month = {12} } @article{LiKimAltman2010, author = {Bing Li and Min Kyung Kim and Naomi Altman}, - doi = {10.1214/09-AOS737}, + year = {2010}, + title = {{On dimension folding of matrix- or array-valued statistical objects}}, journal = {The Annals of Statistics}, - keywords = {directional regression, electroencephalography, Kronecker envelope, sliced average variance estimate, sliced inverse regression}, + volume = {38}, number = {2}, pages = {1094 -- 1121}, publisher = {Institute of Mathematical Statistics}, - title = {{On dimension folding of matrix- or array-valued statistical objects}}, - volume = {38}, - year = {2010} + doi = {10.1214/09-AOS737} } @article{Lin2019, author = {Lin, Zhenhua}, - doi = {10.1137/18M1221084}, + year = {2019}, + title = {Riemannian Geometry of Symmetric Positive Definite Matrices via Cholesky Decomposition}, journal = {SIAM Journal on Matrix Analysis and Applications}, + volume = {40}, number = {4}, pages = {1353--1370}, - title = {Riemannian Geometry of Symmetric Positive Definite Matrices via Cholesky Decomposition}, - volume = {40}, - year = {2019} + doi = {10.1137/18M1221084} } @misc{LiuEtAl2023, - archiveprefix = {arXiv}, author = {Tianyu Liu and Somabha Mukherjee and Rahul Biswas}, - eprint = {2304.00530}, - primaryclass = {math.ST}, + year = {2023}, title = {Tensor Recovery in High-Dimensional Ising Models}, - year = {2023} + eprint = {2304.00530}, + archiveprefix = {arXiv}, + primaryclass = {math.ST} } @inbook{LiuKoike2007, author = {Chunxue Liu and Katsuaki Koike}, - doi = {10.1007/s11004-007-9085-9}, + year = {2007}, + title = {Extending Multivariate Space-Time Geostatistics for Environmental Data Analysis}, journal = {Mathematical Geology}, pages = {289--305}, publisher = {International Association for Mathematical Geology}, - title = {Extending Multivariate Space-Time Geostatistics for Environmental Data Analysis}, - year = {2007} + doi = {10.1007/s11004-007-9085-9} } @article{LiWang2007, author = {Bing Li and Shaoli Wang}, - doi = {10.1198/016214507000000536}, + year = {2007}, + title = {On Directional Regression for Dimension Reduction}, journal = {Journal of the American Statistical Association}, + volume = {102}, number = {479}, pages = {997-1008}, publisher = {Taylor \& Francis}, - title = {On Directional Regression for Dimension Reduction}, - volume = {102}, - year = {2007} + doi = {10.1198/016214507000000536} } @article{LiZhaChiaromonte2005, author = {Li, Bing and Zha, Hongyuan and Chiaromonte, Francesca}, - doi = {10.1214/009053605000000192}, - fjournal = {The Annals of Statistics}, + year = {2005}, + title = {Contour regression: A general approach to dimension reduction}, journal = {Ann. 
Statist.}, + volume = {33}, number = {4}, pages = {1580--1616}, publisher = {The Institute of Mathematical Statistics}, - title = {Contour regression: A general approach to dimension reduction}, - url = {https://doi.org/10.1214/009053605000000192}, - volume = {33}, - year = {2005} + doi = {10.1214/009053605000000192}, + fjournal = {The Annals of Statistics} } @article{LiZhang2017, author = {Lexin Li and Xin Zhang}, - doi = {10.1080/01621459.2016.1193022}, + year = {2017}, + title = {Parsimonious Tensor Response Regression}, journal = {Journal of the American Statistical Association}, + volume = {112}, number = {519}, pages = {1131-1146}, publisher = {Taylor & Francis}, - title = {Parsimonious Tensor Response Regression}, - volume = {112}, - year = {2017} + doi = {10.1080/01621459.2016.1193022} } @article{Lock2018, author = {Eric F. Lock}, - doi = {10.1080/10618600.2017.1401544}, + year = {2018}, + title = {Tensor-on-Tensor Regression}, journal = {Journal of Computational and Graphical Statistics}, + volume = {27}, number = {3}, pages = {638-647}, publisher = {Taylor \& Francis}, - title = {Tensor-on-Tensor Regression}, - volume = {27}, - year = {2018} + doi = {10.1080/10618600.2017.1401544} } @article{LuoLi2016, author = {Luo, Wei and Li, Bing}, - doi = {10.1093/biomet/asw051}, - issn = {0006-3444, 1464-3510}, + year = {2016}, + title = {Combining eigenvalues and variation of eigenvectors for order determination}, journal = {Biometrika}, - month = {12}, + volume = {103}, number = {4}, pages = {875--887}, - title = {Combining eigenvalues and variation of eigenvectors for order determination}, - url = {https://academic.oup.com/biomet/article-lookup/doi/10.1093/biomet/asw051}, - urldate = {2021-10-06}, - volume = {103}, - year = {2016} + issn = {0006-3444, 1464-3510}, + doi = {10.1093/biomet/asw051}, + month = {12} } @article{LuoLi2021, author = {Luo, Wei and Li, Bing}, - doi = {10.1093/biomet/asaa077}, - issn = {0006-3444, 1464-3510}, + year = {2021}, + title = {On order determination by predictor augmentation}, journal = {Biometrika}, - month = {08}, + volume = {108}, number = {3}, pages = {557--574}, - title = {On order determination by predictor augmentation}, - url = {https://academic.oup.com/biomet/article/108/3/557/5917626}, - urldate = {2021-10-06}, - volume = {108}, - year = {2021} + issn = {0006-3444, 1464-3510}, + doi = {10.1093/biomet/asaa077}, + month = {08} } @article{LuZimmerman2005, author = {Nelson Lu and Dale L. Zimmerman}, - doi = {10.1016/j.spl.2005.04.020}, - issn = {0167-7152}, + year = {2005}, + title = {The likelihood ratio test for a separable covariance matrix}, journal = {Statistics \& Probability Letters}, + volume = {73}, number = {4}, pages = {449-457}, - title = {The likelihood ratio test for a separable covariance matrix}, - url = {https://www.sciencedirect.com/science/article/pii/S0167715205001495}, - volume = {73}, - year = {2005} + issn = {0167-7152}, + doi = {10.1016/j.spl.2005.04.020} } @article{MagnusNeudecker1986, author = {Magnus, Jan R. 
and Neudecker, Heinz}, - issn = {02664666, 14694360}, + year = {1986}, + title = {Symmetry, 0-1 Matrices and Jacobians: A Review}, journal = {Econometric Theory}, + volume = {2}, number = {2}, pages = {157--190}, publisher = {Cambridge University Press}, - title = {Symmetry, 0-1 Matrices and Jacobians: A Review}, + issn = {02664666, 14694360}, url = {http://www.jstor.org/stable/3532421}, - urldate = {2023-10-03}, - volume = {2}, - year = {1986} + urldate = {2023-10-03} } @book{MagnusNeudecker1999, author = {Magnus, Jan R. and Neudecker, Heinz}, + year = {1999}, + title = {Matrix differential calculus with applications in statistics and econometrics}, + pages = {xviii+395}, + publisher = {John Wiley \& Sons, Ltd., Chichester}, isbn = {0-471-98633-X}, mrclass = {15-01 (26-01 62-01)}, mrnumber = {1698873}, note = {Revised reprint of the 1988 original}, - pages = {xviii+395}, - publisher = {John Wiley \& Sons, Ltd., Chichester}, - series = {Wiley Series in Probability and Statistics}, - title = {Matrix differential calculus with applications in statistics and econometrics}, - year = {1999} + series = {Wiley Series in Probability and Statistics} } @article{ManceurDutilleul2013, author = {Ameur M. Manceur and Pierre Dutilleul}, - doi = {10.1016/j.cam.2012.09.017}, - issn = {0377-0427}, - journal = {Journal of Computational and Applied Mathematics}, - pages = {37-49}, + year = {2013}, title = {Maximum likelihood estimation for the tensor normal distribution: Algorithm, minimum sample size, and empirical bias and dispersion}, - url = {https://www.sciencedirect.com/science/article/pii/S0377042712003810}, + journal = {Journal of Computational and Applied Mathematics}, volume = {239}, - year = {2013} + pages = {37-49}, + issn = {0377-0427}, + doi = {10.1016/j.cam.2012.09.017} } @incollection{MardiaGoodall1993, author = {Mardia, Kanti V. and Goodall, Colin R.}, + year = {1993}, + title = {Spatial-temporal analysis of multivariate environmental monitoring data}, + volume = {6}, + pages = {347--386}, + publisher = {North-Holland, Amsterdam}, booktitle = {Multivariate environmental statistics}, isbn = {0-444-89804-2}, mrclass = {62H11}, mrnumber = {1268443}, - pages = {347--386}, - publisher = {North-Holland, Amsterdam}, - series = {North-Holland Ser. Statist. Probab.}, - title = {Spatial-temporal analysis of multivariate environmental monitoring data}, - volume = {6}, - year = {1993} + series = {North-Holland Ser. Statist. Probab.} } @inproceedings{MartinFernandez2004, - address = {Berlin, Heidelberg}, author = {Mart{\'i}n-Fern{\'a}ndez, Marcos and Westin, Carl-Fredrik and Alberola-L{\'o}pez, Carlos}, - booktitle = {Medical Image Computing and Computer-Assisted Intervention -- MICCAI 2004}, - editor = {Barillot, Christian and Haynor, David R. and Hellier, Pierre}, - isbn = {978-3-540-30135-6}, + year = {2004}, + title = {3D Bayesian Regularization of Diffusion Tensor MRI Using Multivariate Gaussian Markov Random Fields}, pages = {351--359}, publisher = {Springer Berlin Heidelberg}, - title = {3D Bayesian Regularization of Diffusion Tensor MRI Using Multivariate Gaussian Markov Random Fields}, - year = {2004} + address = {Berlin, Heidelberg}, + booktitle = {Medical Image Computing and Computer-Assisted Intervention -- MICCAI 2004}, + editor = {Barillot, Christian and Haynor, David R. and Hellier, Pierre}, + isbn = {978-3-540-30135-6} } @article{MaZhu2013, author = {Ma, Yanyuan and Zhu, Liping}, - doi = {10.1111/j.1751-5823.2012.00182.x}, - fjournal = {International Statistical Review. 
Revue Internationale de Statistique}, - issn = {0306-7734,1751-5823}, + year = {2013}, + title = {A review on dimension reduction}, journal = {Int. Stat. Rev.}, - mrclass = {62G08 (62-02 62H12)}, - mrnumber = {3047506}, + volume = {81}, number = {1}, pages = {134--150}, - title = {A review on dimension reduction}, - url = {https://doi.org/10.1111/j.1751-5823.2012.00182.x}, - volume = {81}, - year = {2013} + issn = {0306-7734,1751-5823}, + doi = {10.1111/j.1751-5823.2012.00182.x}, + fjournal = {International Statistical Review. Revue Internationale de Statistique}, + mrclass = {62G08 (62-02 62H12)}, + mrnumber = {3047506} } @book{McCullagh1987, author = {McCullagh, Peter}, - doi = {10.1201/9781351077118}, - publisher = {Chapman and Hall/CRC}, - subtitle = {Monographs on Statistics and Applied Probability}, + year = {1987}, title = {Tensor Methods in Statistics}, - year = {1987} + publisher = {Chapman and Hall/CRC}, + doi = {10.1201/9781351077118}, + subtitle = {Monographs on Statistics and Applied Probability} } @article{McCullochPitts1943, author = {Mc{C}ulloch, Warren S and Pitts, Walter}, - journal = {Bulletin of Mathematical Biophysics}, - pages = {115--133}, - publisher = {Springer}, + year = {1943}, title = {A Logical Calculus of the Ideas Immanent in Nervous Activity}, + journal = {Bulletin of Mathematical Biophysics}, volume = {5}, - year = {1943} + pages = {115--133}, + publisher = {Springer} } -@misc{MukherjeeEtAl2020, - archiveprefix = {arXiv}, - author = {Somabha Mukherjee and Jaesung Son and Bhaswar B. Bhattacharya}, - eprint = {2008.12882}, - primaryclass = {math.ST}, - title = {Estimation in Tensor Ising Models}, - year = {2020} +@article{MukherjeeEtAl2022, + author = {Mukherjee, Somabha and Son, Jaesung and Bhattacharya, Bhaswar B}, + year = {2022}, + title = {Estimation in tensor Ising models}, + journal = {Information and Inference: A Journal of the IMA}, + volume = {11}, + number = {4}, + pages = {1457--1500}, + issn = {2049-8772}, + doi = {10.1093/imaiai/iaac007}, + month = {06} } @article{Nadarajah2005, author = {Saralees Nadarajah}, - doi = {10.1080/02664760500079464}, + year = {2005}, + title = {A generalized normal distribution}, journal = {Journal of Applied Statistics}, + volume = {32}, number = {7}, pages = {685--694}, publisher = {Taylor \& Francis}, - title = {A generalized normal distribution}, - volume = {32}, - year = {2005} + doi = {10.1080/02664760500079464} } @inproceedings{Nesterov1983, author = {Nesterov, Yurii Evgen'evich}, - booktitle = {Doklady Akademii Nauk}, - number = {3}, - organization = {Russian Academy of Sciences}, - pages = {543--547}, + year = {1983}, title = {A method of solving a convex programming problem with convergence rate $O(1/k^2)$}, volume = {269}, - year = {1983} + number = {3}, + pages = {543--547}, + booktitle = {Doklady Akademii Nauk}, + organization = {Russian Academy of Sciences} } @article{NguyenEtAl2017, author = {H. 
Chau Nguyen and Riccardo Zecchina and Johannes Berg}, - doi = {10.1080/00018732.2017.1341604}, + year = {2017}, + title = {Inverse statistical problems: from the inverse {I}sing problem to data science}, journal = {Advances in Physics}, + volume = {66}, number = {3}, pages = {197--261}, publisher = {Taylor \& Francis}, - title = {Inverse statistical problems: from the inverse {I}sing problem to data science}, - volume = {66}, - year = {2017} + doi = {10.1080/00018732.2017.1341604} } @article{Niss2005, author = {Niss, Martin}, - doi = {10.1007/s00407-004-0088-3}, - fjournal = {Archive for History of Exact Sciences}, - issn = {1432-0657}, + year = {2005}, + title = {{History of the Lenz-Ising Model 1920--1950: From Ferromagnetic to Cooperative Phenomena}}, journal = {Arch. Hist. Exact Sci.}, + volume = {59}, number = {3}, pages = {267--318}, - title = {{History of the Lenz-Ising Model 1920--1950: From Ferromagnetic to Cooperative Phenomena}}, - volume = {59}, - year = {2005} + issn = {1432-0657}, + doi = {10.1007/s00407-004-0088-3}, + fjournal = {Archive for History of Exact Sciences} } @article{OhlsonEtAl2013, author = {Ohlson, Martin and Ahmad, Mumtaz Rauf and von Rosen, Dietrich}, - doi = {10.1016/j.jmva.2011.05.015}, - issn = {0047-259X}, - journal = {Journal of Multivariate Analysis}, - pages = {37-47}, + year = {2013}, title = {The multilinear normal distribution: Introduction and some basic properties}, - url = {https://www.sciencedirect.com/science/article/pii/S0047259X11001047}, + journal = {Journal of Multivariate Analysis}, volume = {113}, - year = {2013} + pages = {37-47}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2011.05.015} } @article{Oseledets2011, author = {Oseledets, I. V.}, - doi = {10.1137/090752286}, + year = {2011}, + title = {Tensor-Train Decomposition}, journal = {SIAM Journal on Scientific Computing}, + volume = {33}, number = {5}, pages = {2295-2317}, - title = {Tensor-Train Decomposition}, - volume = {33}, - year = {2011} + doi = {10.1137/090752286} } @article{PanMaiZhang2018, author = {Yuqing Pan and Qing Mai and Xin Zhang}, - doi = {10.1080/01621459.2018.1497500}, - eprint = {https://doi.org/10.1080/01621459.2018.1497500}, + year = {2018}, + title = {Covariate-Adjusted Tensor Classification in High dimensions}, journal = {Journal of the American Statistical Association}, + volume = {0}, number = {ja}, pages = {1-41}, publisher = {Taylor & Francis}, - title = {Covariate-Adjusted Tensor Classification in High dimensions}, - url = {https://doi.org/10.1080/01621459.2018.1497500}, - volume = {0}, - year = {2018} + doi = {10.1080/01621459.2018.1497500} } @book{Pepe03, - address = {New York}, author = {Pepe, M.S.}, - publisher = {Oxford University Press}, + year = {2003}, title = {The Statistical Evaluation of Medical Tests for Classification and Prediction}, - year = {2003} + publisher = {Oxford University Press}, + address = {New York} } @article{PfeifferForzaniBura2012, author = {Pfeiffer, Ruth and Forzani, Liliana and Bura, Efstathia}, - doi = {10.1002/sim.4437}, - journal = {Statistics in medicine}, - month = {09}, - pages = {2414-27}, + year = {2012}, title = {Sufficient dimension reduction for longitudinally measured predictors}, + journal = {Statistics in medicine}, volume = {31}, - year = {2012} + pages = {2414-27}, + doi = {10.1002/sim.4437}, + month = {09} } @article{PfeifferKaplaBura2021, author = {Pfeiffer, Ruth and Kapla, Daniel and Bura, Efstathia}, - doi = {10.1007/s41060-020-00228-y}, - journal = {International Journal of Data Science and 
Analytics}, + year = {2021}, title = {{Least squares and maximum likelihood estimation of sufficient reductions in regressions with matrix-valued predictors}}, + journal = {International Journal of Data Science and Analytics}, volume = {11}, - year = {2021} + doi = {10.1007/s41060-020-00228-y} } @inproceedings{RabusseauKadri2016, author = {Rabusseau, Guillaume and Kadri, Hachem}, - booktitle = {Advances in Neural Information Processing Systems}, - editor = {D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett}, + year = {2016}, + title = {Low-Rank Regression with Tensor Responses}, + volume = {29}, pages = {}, publisher = {Curran Associates, Inc.}, - title = {Low-Rank Regression with Tensor Responses}, url = {https://proceedings.neurips.cc/paper_files/paper/2016/file/3806734b256c27e41ec2c6bffa26d9e7-Paper.pdf}, - volume = {29}, - year = {2016} + booktitle = {Advances in Neural Information Processing Systems}, + editor = {D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett} } @article{Rosenblatt1958, author = {Frank Rosenblatt}, - doi = {10.1037/h0042519}, + year = {1958}, + title = {The perceptron: A probabilistic model for information storage and organization in the brain}, journal = {Psychological Review}, + volume = {65}, number = {6}, pages = {386--408}, - title = {The perceptron: A probabilistic model for information storage and organization in the brain}, - volume = {65}, - year = {1958} + doi = {10.1037/h0042519} } @inproceedings{Rumelhart1986, author = {David E. Rumelhart and Geoffrey E. Hinton and Ronald J. Williams}, + year = {1986}, title = {Learning internal representations by error propagation}, - url = {https://api.semanticscholar.org/CorpusID:62245742}, - year = {1986} + url = {https://api.semanticscholar.org/CorpusID:62245742} } @article{RuppertWand1994, author = {D. Ruppert and M. P. Wand}, - issn = {00905364}, + year = {1994}, + title = {Multivariate Locally Weighted Least Squares Regression}, journal = {The Annals of Statistics}, + volume = {22}, number = {3}, pages = {1346--1370}, publisher = {Institute of Mathematical Statistics}, - title = {Multivariate Locally Weighted Least Squares Regression}, + issn = {00905364}, url = {http://www.jstor.org/stable/2242229}, - urldate = {2024-01-25}, - volume = {22}, - year = {1994} + urldate = {2024-01-25} } @article{SchneidmanEtAl2006, author = {Schneidman, Elad and Berry, Michael J. 
and Segev, Ronen and Bialek, William}, - day = {01}, - doi = {10.1038/nature04701}, - issn = {1476-4687}, + year = {2006}, + title = {Weak pairwise correlations imply strongly correlated network states in a neural population}, journal = {Nature}, - month = {Apr}, + volume = {440}, number = {7087}, pages = {1007-1012}, - title = {Weak pairwise correlations imply strongly correlated network states in a neural population}, - volume = {440}, - year = {2006} + issn = {1476-4687}, + doi = {10.1038/nature04701}, + day = {01}, + month = {Apr} } @inproceedings{ShanEtAl2008, author = {Shiguang Shan and Bo Cao and Yu Su and Laiyun Qing and Xilin Chen and Wen Gao}, - booktitle = {2008 IEEE Conference on Computer Vision and Pattern Recognition}, - doi = {10.1109/CVPR.2008.4587375}, - issn = {1063-6919}, - number = {}, - pages = {1-7}, + year = {2008}, title = {Unified Principal Component Analysis with generalized Covariance Matrix for face recognition}, volume = {}, - year = {2008} + number = {}, + pages = {1-7}, + issn = {1063-6919}, + doi = {10.1109/CVPR.2008.4587375}, + booktitle = {2008 IEEE Conference on Computer Vision and Pattern Recognition} } @inproceedings{ShashuaHazan2005, - address = {New York, NY, USA}, author = {Shashua, Amnon and Hazan, Tamir}, - booktitle = {Proceedings of the 22nd International Conference on Machine Learning}, + year = {2005}, + title = {Non-Negative Tensor Factorization with Applications to Statistics and Computer Vision}, + pages = {792--799}, + publisher = {Association for Computing Machinery}, doi = {10.1145/1102351.1102451}, + address = {New York, NY, USA}, + booktitle = {Proceedings of the 22nd International Conference on Machine Learning}, isbn = {1595931805}, location = {Bonn, Germany}, numpages = {8}, - pages = {792--799}, - publisher = {Association for Computing Machinery}, - series = {ICML '05}, - title = {Non-Negative Tensor Factorization with Applications to Statistics and Computer Vision}, - year = {2005} + series = {ICML '05} } @inproceedings{Smolensky1986, author = {Paul Smolensky}, + year = {1986}, title = {Information Processing in Dynamical Systems: Foundations of Harmony Theory}, - url = {https://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf}, - year = {1986} + url = {https://stanford.edu/~jlmcc/papers/PDP/Volume%201/Chap6_PDP86.pdf} } @article{Soize2008, author = {C. Soize}, - doi = {10.1016/j.probengmech.2007.12.019}, - issn = {0266-8920}, + year = {2008}, + title = {Tensor-valued random fields for meso-scale stochastic model of anisotropic elastic microstructure and probabilistic analysis of representative volume element size}, journal = {Probabilistic Engineering Mechanics}, - note = {5th International Conference on Computational Stochastic Mechanics}, + volume = {23}, number = {2}, pages = {307-323}, - title = {Tensor-valued random fields for meso-scale stochastic model of anisotropic elastic microstructure and probabilistic analysis of representative volume element size}, - url = {https://www.sciencedirect.com/science/article/pii/S0266892007000562}, - volume = {23}, - year = {2008} + issn = {0266-8920}, + doi = {10.1016/j.probengmech.2007.12.019}, + note = {5th International Conference on Computational Stochastic Mechanics} } @article{SoloveychikTrushin2016, author = {I. Soloveychik and D. 
Trushin}, - doi = {10.1016/j.jmva.2016.04.001}, - issn = {0047-259X}, - journal = {Journal of Multivariate Analysis}, - pages = {92-113}, + year = {2016}, title = {Gaussian and robust Kronecker product covariance estimation: Existence and uniqueness}, - url = {https://www.sciencedirect.com/science/article/pii/S0047259X16300070}, + journal = {Journal of Multivariate Analysis}, volume = {149}, - year = {2016} + pages = {92-113}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2016.04.001} } @misc{SongHero2023, - archiveprefix = {arXiv}, author = {Dogyoon Song and Alfred O. Hero}, - doi = {10.48550/arXiv.2302.02415}, - eprint = {2302.02415}, - primaryclass = {math.ST}, + year = {2023}, title = {On Separability of Covariance in Multiway Data Analysis}, - year = {2023} + eprint = {2302.02415}, + archiveprefix = {arXiv}, + primaryclass = {math.ST} } @article{SrivastavaEtAl2008, author = {Srivastava, Muni Shanker and von Rosen, Tatjana and von Rosen, Dietrich}, - day = {01}, - doi = {10.3103/S1066530708040066}, - issn = {1934-8045}, + year = {2008}, + title = {Models with a Kronecker product covariance structure: Estimation and testing}, journal = {Mathematical Methods of Statistics}, - month = {Dec}, + volume = {17}, number = {4}, pages = {357-370}, - title = {Models with a Kronecker product covariance structure: Estimation and testing}, - volume = {17}, - year = {2008} + issn = {1934-8045}, + doi = {10.3103/S1066530708040066}, + day = {01}, + month = {Dec} } @article{SrivastavaEtAl2014, author = {Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov}, + year = {2014}, + title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, journal = {Journal of Machine Learning Research}, + volume = {15}, number = {56}, pages = {1929-1958}, - title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, - url = {http://jmlr.org/papers/v15/srivastava14a.html}, - volume = {15}, - year = {2014} + url = {http://jmlr.org/papers/v15/srivastava14a.html} } @book{SrivastavaKhatri1979, - address = {New York, NY [u.a.]}, author = {Srivastava, Muni Shanker and Khatri, Chinubal G.}, - isbn = {0444003029}, - language = {eng}, - publisher = {North Holland}, + year = {1979}, title = {An introduction to multivariate statistics}, - year = {1979} + publisher = {North Holland}, + address = {New York, NY [u.a.]}, + isbn = {0444003029}, + language = {eng} } @article{Steinberger2018, author = {Steinberger, Lukas and Leeb, Hannes}, - doi = {10.3150/16-BEJ888}, - fjournal = {Bernoulli}, + year = {2018}, + title = {On conditional moments of high-dimensional random vectors given lower-dimensional projections}, journal = {Bernoulli}, + volume = {24}, number = {1}, pages = {565--591}, publisher = {Bernoulli Society for Mathematical Statistics and Probability}, - title = {On conditional moments of high-dimensional random vectors given lower-dimensional projections}, - volume = {24}, - year = {2018} + doi = {10.3150/16-BEJ888}, + fjournal = {Bernoulli} } @article{Tseng1993, author = {Paul Tseng}, - journal = {Mathematical Programming}, - number = {1}, - pages = {231-247}, + year = {1993}, title = {Dual coordinate ascent methods for non-strictly convex minimization}, + journal = {Mathematical Programming}, volume = {59}, - year = {1993} + number = {1}, + pages = {231-247} } @article{TsiligkaridisHero2013, author = {Tsiligkaridis, Theodoros and Hero, Alfred O.}, - doi = {10.1109/TSP.2013.2279355}, + year = {2013}, + title = {Covariance Estimation in High 
Dimensions Via Kronecker Product Expansions}, journal = {IEEE Transactions on Signal Processing}, + volume = {61}, number = {21}, pages = {5347-5360}, - title = {Covariance Estimation in High Dimensions Via Kronecker Product Expansions}, - volume = {61}, - year = {2013} + doi = {10.1109/TSP.2013.2279355} } @inbook{Uschmajew2020, - address = {Cham}, author = {Uschmajew, Andr{\'e} and Vandereycken, Bart}, - booktitle = {Handbook of Variational Methods for Nonlinear Geometric Data}, - doi = {10.1007/978-3-030-31351-7_9}, - editor = {Grohs, Philipp and Holler, Martin and Weinmann, Andreas}, - isbn = {978-3-030-31351-7}, + year = {2020}, + title = {Geometric Methods on Low-Rank Matrix and Tensor Manifolds}, pages = {261--313}, publisher = {Springer International Publishing}, - title = {Geometric Methods on Low-Rank Matrix and Tensor Manifolds}, - year = {2020} + doi = {10.1007/978-3-030-31351-7_9}, + address = {Cham}, + booktitle = {Handbook of Variational Methods for Nonlinear Geometric Data}, + editor = {Grohs, Philipp and Holler, Martin and Weinmann, Andreas}, + isbn = {978-3-030-31351-7} } @book{vanderVaart1998, author = {{van der Vaart}, A.W.}, - isbn = {0-521-49603-9}, - publisher = {Cambridge University Press}, - series = {Asymptotic Statistics}, - series = {Cambridge Series in Statistical and Probabilistic Mathematics}, + year = {1998}, title = {Asymptotic Statistics}, - year = {1998} + publisher = {Cambridge University Press}, + isbn = {0-521-49603-9}, + series = {Asymptotic Statistics}, + series = {Cambridge Series in Statistical and Probabilistic Mathematics} } @inbook{VanLoanPitsianis1993, - address = {Dordrecht}, author = {Van Loan, C. F. and Pitsianis, N.}, - booktitle = {Linear Algebra for Large Scale and Real-Time Applications}, - doi = {10.1007/978-94-015-8196-7_17}, - editor = {Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.}, - isbn = {978-94-015-8196-7}, + year = {1993}, + title = {Approximation with Kronecker Products}, pages = {293--314}, publisher = {Springer Netherlands}, - title = {Approximation with Kronecker Products}, - year = {1993} + doi = {10.1007/978-94-015-8196-7_17}, + address = {Dordrecht}, + booktitle = {Linear Algebra for Large Scale and Real-Time Applications}, + editor = {Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.}, + isbn = {978-94-015-8196-7} } @article{WainwrightJordan2008, author = {Martin J. Wainwright and Michael I. Jordan}, - doi = {10.1561/2200000001}, - issn = {1935-8237}, + year = {2008}, + title = {Graphical Models, Exponential Families, and Variational Inference}, journal = {Foundations and Trends® in Machine Learning}, + volume = {1}, number = {1--2}, pages = {1-305}, - title = {Graphical Models, Exponential Families, and Variational Inference}, - url = {http://dx.doi.org/10.1561/2200000001}, - volume = {1}, - year = {2008} + issn = {1935-8237}, + doi = {10.1561/2200000001} } @article{WangEtAl2022, author = {Yu Wang and Zeyu Sun and Dogyoon Song and Alfred Hero}, - doi = {10.1214/22-SS139}, + year = {2022}, + title = {{Kronecker-structured covariance models for multiway data}}, journal = {Statistics Surveys}, + volume = {16}, number = {none}, pages = {238 -- 270}, publisher = {Amer. Statist. Assoc., the Bernoulli Soc., the Inst. Math. Statist., and the Statist. Soc. 
Canada}, - title = {{Kronecker-structured covariance models for multiway data}}, - volume = {16}, - year = {2022} + doi = {10.1214/22-SS139} } @article{WangLi2020, author = {Miaoyan Wang and Lexin Li}, - journal = {Journal of Machine Learning Research}, - number = {154}, - pages = {1--38}, + year = {2020}, title = {Learning from Binary Multiway Data: Probabilistic Tensor Decomposition and its Statistical Optimality}, + journal = {Journal of Machine Learning Research}, volume = {21}, - year = {2020} + number = {154}, + pages = {1--38} } @article{WangXia2008, author = {Hansheng Wang and Yingcun Xia}, - doi = {10.1198/016214508000000418}, - fjournal = {Journal of the American Statistical Association}, + year = {2008}, + title = {{Sliced Regression for Dimension Reduction}}, journal = {J. Amer. Statist. Assoc.}, + volume = {103}, number = {482}, pages = {811--821}, publisher = {Taylor \& Francis}, - title = {{Sliced Regression for Dimension Reduction}}, - volume = {103}, - year = {2008} + doi = {10.1198/016214508000000418}, + fjournal = {Journal of the American Statistical Association} } @book{Whittaker1990, author = {Whittaker, Joe}, + year = {1990}, + title = {Graphical models in applied multivariate statistics}, + pages = {xiv+448}, + publisher = {John Wiley \& Sons, Ltd., Chichester}, isbn = {0-471-91750-8}, mrclass = {62-02 (62H17 62J12)}, mrnumber = {1112133}, - pages = {xiv+448}, - publisher = {John Wiley \& Sons, Ltd., Chichester}, - series = {Wiley Series in Probability and Mathematical Statistics: Probability and Mathematical Statistics}, - title = {Graphical models in applied multivariate statistics}, - year = {1990} + series = {Wiley Series in Probability and Mathematical Statistics: Probability and Mathematical Statistics} } @book{Whittaker2009, author = {J. Whittaker}, - publisher = {Wiley}, + year = {2009}, title = {Graphical Models in Applied Multivariate Statistics}, - year = {2009} + publisher = {Wiley} } @article{Xia2007, author = {Xia, Yingcun}, - doi = {10.1214/009053607000000352}, - fjournal = {The Annals of Statistics}, - issn = {0090-5364,2168-8966}, + year = {2007}, + title = {A constructive approach to the estimation of dimension reduction directions}, journal = {Ann. Statist.}, - mrclass = {62G08 (62G09 62H05)}, - mrnumber = {2382662}, + volume = {35}, number = {6}, pages = {2654--2690}, - title = {A constructive approach to the estimation of dimension reduction directions}, - url = {https://doi.org/10.1198/016214508000000805}, - volume = {35}, - year = {2007} + issn = {0090-5364,2168-8966}, + doi = {10.1214/009053607000000352}, + fjournal = {The Annals of Statistics}, + mrclass = {62G08 (62G09 62H05)}, + mrnumber = {2382662} } @article{Xia2008, author = {Xia, Yingcun}, - doi = {10.1198/016214508000000805}, - fjournal = {Journal of the American Statistical Association}, - issn = {0162-1459,1537-274X}, + year = {2008}, + title = {A multiple-index model and dimension reduction}, journal = {J. Amer. Statist. Assoc.}, - mrclass = {62E20 (62F10 62G05)}, - mrnumber = {2504209}, + volume = {103}, number = {484}, pages = {1631--1640}, - title = {A multiple-index model and dimension reduction}, - url = {https://doi.org/10.1198/016214508000000805}, - volume = {103}, - year = {2008} + issn = {0162-1459,1537-274X}, + doi = {10.1198/016214508000000805}, + fjournal = {Journal of the American Statistical Association}, + mrclass = {62E20 (62F10 62G05)}, + mrnumber = {2504209} } @article{XiaEtAl2002, author = {Xia, Yingcun and Tong, Howell and Li, W. K. 
and Zhu, Li-Xing}, - doi = {10.1111/1467-9868.03411}, - fjournal = {Journal of the Royal Statistical Society. Series B: Statistical Methodology}, - issn = {1369-7412}, + year = {2002}, + title = {{An Adaptive Estimation of Dimension Reduction Space}}, journal = {J. R. Stat. Soc. Ser. B Stat. Methodol.}, - month = {08}, + volume = {64}, number = {3}, pages = {363-410}, - title = {{An Adaptive Estimation of Dimension Reduction Space}}, - volume = {64}, - year = {2002} + issn = {1369-7412}, + doi = {10.1111/1467-9868.03411}, + fjournal = {Journal of the Royal Statistical Society. Series B: Statistical Methodology}, + month = {08} } @article{YangEtAl2004, author = {Jian Yang and D. Zhang and A. F. Frangi and Jingyu Yang}, - doi = {10.1109/TPAMI.2004.1261097}, - issn = {0162-8828}, + year = {2004}, + title = {Two-dimensional {PCA}: a new approach to appearance-based face representation and recognition}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + volume = {26}, number = {1}, pages = {131-137}, - title = {Two-dimensional {PCA}: a new approach to appearance-based face representation and recognition}, - volume = {26}, - year = {2004} + issn = {0162-8828}, + doi = {10.1109/TPAMI.2004.1261097} } @article{Ye2005, author = {Ye, Jieping}, - day = {01}, - doi = {10.1007/s10994-005-3561-6}, - issn = {1573-0565}, + year = {2005}, + title = {Generalized Low Rank Approximations of Matrices}, journal = {Machine Learning}, + volume = {61}, number = {1}, pages = {167--191}, - title = {Generalized Low Rank Approximations of Matrices}, - url = {https://doi.org/10.1007/s10994-005-3561-6}, - volume = {61}, - year = {2005} + issn = {1573-0565}, + doi = {10.1007/s10994-005-3561-6}, + day = {01} } @article{YeLim2016, author = {Ye, Ke and Lim, Lek-Heng}, - doi = {10.1137/15M1054201}, + year = {2016}, + title = {Schubert Varieties and Distances between Subspaces of Different Dimensions}, journal = {SIAM Journal on Matrix Analysis and Applications}, + volume = {37}, number = {3}, pages = {1176-1197}, - title = {Schubert Varieties and Distances between Subspaces of Different Dimensions}, - volume = {37}, - year = {2016} + doi = {10.1137/15M1054201} } @inbook{Yin2010, author = {Yin, Xiangrong}, - booktitle = {High-Dimensional Data Analysis}, + year = {2010}, + title = {Sufficient Dimension Reduction in Regression}, pages = {257--273}, publisher = {WORLD SCIENTIFIC / HIGHER EDUCATION PRESS, CHINA}, - title = {Sufficient Dimension Reduction in Regression}, url = {https://doi.org/10.1142/9789814324861_0009}, - year = {2010} + booktitle = {High-Dimensional Data Analysis} } @article{YinHilafu2015, author = {Yin, Xiangrong and Hilafu, Haileab}, - doi = {10.1111/rssb.12093}, - fjournal = {Journal of the Royal Statistical Society. Series B. Statistical Methodology}, - issn = {1369-7412,1467-9868}, + year = {2015}, + title = {Sequential sufficient dimension reduction for large {$p$}, small {$n$} problems}, journal = {J. R. Stat. Soc. Ser. B. Stat. Methodol.}, - mrclass = {62H12}, - mrnumber = {3382601}, - mrreviewer = {Santiago\ Velilla}, + volume = {77}, number = {4}, pages = {879--892}, - title = {Sequential sufficient dimension reduction for large {$p$}, small {$n$} problems}, - url = {https://doi.org/10.1111/rssb.12093}, - volume = {77}, - year = {2015} + issn = {1369-7412,1467-9868}, + doi = {10.1111/rssb.12093}, + fjournal = {Journal of the Royal Statistical Society. Series B. 
Statistical Methodology}, + mrclass = {62H12}, + mrnumber = {3382601}, + mrreviewer = {Santiago\ Velilla} } @article{YinLiCook2008, author = {Yin, Xiangrong and Li, Bing and Cook, R. Dennis}, - doi = {10.1016/j.jmva.2008.01.006}, - fjournal = {Journal of Multivariate Analysis}, - issn = {0047-259X,1095-7243}, + year = {2008}, + title = {Successive direction extraction for estimating the central subspace in a multiple-index regression}, journal = {J. Multivariate Anal.}, - mrclass = {62B05 (62H20)}, - mrnumber = {2444817}, + volume = {99}, number = {8}, pages = {1733--1757}, - title = {Successive direction extraction for estimating the central subspace in a multiple-index regression}, - url = {https://doi.org/10.1016/j.jmva.2008.01.006}, - volume = {99}, - year = {2008} + issn = {0047-259X,1095-7243}, + doi = {10.1016/j.jmva.2008.01.006}, + fjournal = {Journal of Multivariate Analysis}, + mrclass = {62B05 (62H20)}, + mrnumber = {2444817} } @article{YuBiYe2010, author = {Shipeng Yu and Jinbo Bi and Jieping Ye}, - journal = {Data Mining and Knowledge Discovery}, - pages = {372-392}, + year = {2010}, title = {Matrix-variate and higher-order probabilistic projections}, + journal = {Data Mining and Knowledge Discovery}, volume = {22}, - year = {2010} + pages = {372-392} } @article{ZengZhu2010, author = {Peng Zeng and Yu Zhu}, - doi = {10.1016/j.jmva.2009.08.004}, - fjournal = {Journal of Multivariate Analysis}, - issn = {0047-259X}, + year = {2010}, + title = {An integral transform method for estimating the central mean and central subspaces}, journal = {J. Multivariate Anal.}, + volume = {101}, number = {1}, pages = {271--290}, - title = {An integral transform method for estimating the central mean and central subspaces}, - url = {https://www.sciencedirect.com/science/article/pii/S0047259X0900147X}, - volume = {101}, - year = {2010} + issn = {0047-259X}, + doi = {10.1016/j.jmva.2009.08.004}, + fjournal = {Journal of Multivariate Analysis} } @article{ZhangLin2017, author = {Xin Zhang and Lexin Li}, - doi = {10.1080/00401706.2016.1272495}, + year = {2017}, + title = {Tensor Envelope Partial Least-Squares Regression}, journal = {Technometrics}, + volume = {59}, number = {4}, pages = {426-436}, publisher = {Taylor & Francis}, - title = {Tensor Envelope Partial Least-Squares Regression}, - volume = {59}, - year = {2017} + doi = {10.1080/00401706.2016.1272495} } @article{ZhangZhou2005, author = {Daoqiang Zhang and Zhi-Hua Zhou}, - doi = {10.1016/j.neucom.2005.06.004}, - issn = {0925-2312}, + year = {2005}, + title = {{(2D)2PCA}: Two-directional two-dimensional {PCA} for efficient face representation and recognition}, journal = {Neurocomputing}, - note = {Neural Networks in Signal Processing}, + volume = {69}, number = {1}, pages = {224-231}, - title = {{(2D)2PCA}: Two-directional two-dimensional {PCA} for efficient face representation and recognition}, - url = {https://www.sciencedirect.com/science/article/pii/S0925231205001785}, - volume = {69}, - year = {2005} + issn = {0925-2312}, + doi = {10.1016/j.neucom.2005.06.004}, + note = {Neural Networks in Signal Processing} } @article{ZhouEtAl2023, author = {Jie Zhou, Will Wei Sun, Jingfei Zhang and Lexin Li}, - doi = {10.1080/01621459.2021.1938082}, + year = {2023}, + title = {Partially Observed Dynamic Tensor Response Regression}, journal = {Journal of the American Statistical Association}, + volume = {118}, number = {541}, pages = {424-439}, publisher = {Taylor & Francis}, - title = {Partially Observed Dynamic Tensor Response Regression}, - volume = 
{118}, - year = {2023} + doi = {10.1080/01621459.2021.1938082} } @article{ZhouLi2014, author = {Zhou, Hua and Li, Lexin}, + year = {2014}, + title = {Regularized matrix regression}, journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)}, + volume = {76}, number = {2}, pages = {463--483}, - publisher = {[Royal Statistical Society, Wiley]}, - title = {Regularized matrix regression}, - volume = {76}, - year = {2014} + publisher = {[Royal Statistical Society, Wiley]} } @article{ZhouLiZhu2013, author = {Zhou, H. and Li, L. and Zhu, H.}, - issue = {502}, - journal = {Journal of the American Statistical Association}, - pages = {540-552}, + year = {2013}, title = {Tensor regression with applications in neuroimaging data analysis}, + journal = {Journal of the American Statistical Association}, volume = {108}, - year = {2013} + pages = {540-552}, + issue = {502} } @article{ZouChen2012, author = {Zou, Changliang and Chen, Xin}, - journal = {Journal of Multivariate Analysis}, - number = {C}, - pages = {248-255}, + year = {2012}, title = {On the consistency of coordinate-independent sparse estimation with BIC}, + journal = {Journal of Multivariate Analysis}, volume = {112}, - year = {2012} + number = {C}, + pages = {248-255} } @misc{lichess-database, - title = {lichess.org open database}, - author = {Thibault Duplessis}, + author = {Duplessis, Thibault}, year = {2013}, - note = {visited on December 8, 2023}, - url = {https://database.lichess.org} + title = {lichess.org open database}, + url = {https://database.lichess.org}, + note = {visited on December 8, 2023} } @misc{stockfish, - title = {Stockfish}, author = {{The Stockfish developers (see \href{https://github.com/official-stockfish/Stockfish/blob/master/AUTHORS}{AUTHORS} file)}}, year = {since 2008}, + title = {Stockfish}, note = {Stockfish is a free and strong UCI chess engine. URL: \url{https://stockfishchess.org}}, shorthand = {SF08}, sortyear = 2008 } + +@misc{eeg-dataset, + author = {Henri Begleiter}, + year = 1999, + title = {EEG dataset}, + url = {http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}, + note = {Donated by Lester Ingber} +} diff --git a/LaTeX/paper.tex b/LaTeX/paper.tex index 7dff702..8919c38 100644 --- a/LaTeX/paper.tex +++ b/LaTeX/paper.tex @@ -13,7 +13,7 @@ \usepackage{environ} % for dynamic TikZ picture scaling \usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms \usepackage[ - style = apa, % citation style + style = authoryear, % citation style isbn = false, % show isbn? maxbibnames = 50, % maximal number of names in bibilography maxcitenames = 2, % maximal number of names in text before et al. @@ -28,8 +28,8 @@ sortcites = false, natbib = false, dashed = true, - url = false, - doi = false, + url = true, + doi = true, bibencoding = utf8 ]{biblatex} @@ -272,9 +272,9 @@ \maketitle \begin{abstract} - We consider supervised learning (regression/classification) problems where the independent variable is tensor-valued. We derive a multi-linear sufficient reduction for the regression or classification problem modeling the conditional distribution of the predictors given the response as a member of the quadratic exponential family. Using manifold theory, we prove the consistency and asymptotic normality of the sufficient reduction. We develop estimation procedures of - sufficient reductions for both continuous and binary tensor-valued predictors. 
For continuous predictors, the algorithm is highly computationally efficient and is also applicable to situations where the dimension of - the reduction exceeds the sample size. We demonstrate the superior performance of our approach in simulations and real-world data examples for both continuous and binary tensor-valued predictors. The \textit{Chess data} analysis results agree with a human player's understanding of the game and confirm the relevance of our approach. + We consider supervised learning (regression/classification) problems where the independent variable is tensor-valued. We derive a multi-linear sufficient reduction for the regression or classification problem modeling the conditional distribution of the predictors given the response as a member of the quadratic exponential family. Using manifold theory, we prove the consistency and asymptotic normality of the sufficient reduction. We develop estimation procedures of + sufficient reductions for both continuous and binary tensor-valued predictors. For continuous predictors, the algorithm is highly computationally efficient and also applies when the dimension of + the reduction exceeds the sample size. We demonstrate the superior performance of our approach in simulations and real-world data examples for both continuous and binary tensor-valued predictors. The \textit{Chess data} analysis results agree with a human player's understanding of the game and confirm the relevance of our approach. \end{abstract} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -283,61 +283,35 @@ Tensors are a mathematical tool to represent data of complex structure in statistics. \textit{Tensors} are considered as a generalization of matrices to higher dimensions: A tensor is a multi-dimensional array of numbers. For example, a second-order tensor can be represented as a matrix, while a third-order tensor can be represented as a cube of matrices. -Complex data are collected at different times and/or under several conditions often involving a large number of multi-indexed variables represented as tensor-valued data \parencite{KoldaBader2009}. They occur in large-scale longitudinal studies \parencite[e.g.][]{Hoff2015}, in agricultural experiments and chemometrics and spectroscopy \parencite[e.g.][]{LeurgansRoss1992,Burdick1995}, in signal and video processing where sensors produce multi-indexed data, e.g. over spatial, frequency, and temporal dimensions \parencite[e.g.][]{DeLathauwerCastaing2007,KofidisRegalia2005}, and in telecommunications \parencite[e.g.][]{DeAlmeidaEtAl2007}. Other examples of multiway data include 3D images of the brain, where the modes are the 3 spatial dimensions, and spatio-temporal weather imaging data, a set of image sequences represented as 2 spatial modes and 1 temporal mode. +Complex data are collected at different times and/or under several conditions, often involving a large number of multi-indexed variables represented as tensor-valued data \parencite{KoldaBader2009}. They occur in large-scale longitudinal studies \parencite[e.g.][]{Hoff2015}, in agricultural experiments, chemometrics, and spectroscopy \parencite[e.g.][]{LeurgansRoss1992,Burdick1995}, in signal and video processing where sensors produce multi-indexed data, e.g. over spatial, frequency, and temporal dimensions \parencite[e.g.][]{DeLathauwerCastaing2007,KofidisRegalia2005}, and in telecommunications \parencite[e.g.][]{DeAlmeidaEtAl2007}. 
Other examples of multiway data include 3D images of the brain, where the modes are the 3 spatial dimensions, and spatio-temporal weather imaging data, a set of image sequences represented as 2 spatial modes and 1 temporal mode.
-% \begin{itemize}
-% \item Review \cite{ZhouLiZhu2013} and see how you compare with them. They focus on the forward regression model with a scalar response but they claim that "Exploiting the array structure in imaging data, the new method substantially reduces the dimensionality of imaging data, which leads to efficient estimation and prediction."
-% \item Read \cite{ZhouEtAl2023} to figure out the distribution they use for the tensor-valued predictors and briefly describe what they do.
-% \item Read \cite{RabusseauKadri2016} to figure out what they do. They seem to draw both the response and the predictors from tensor-normal with iid N(0,1) entries: "In order to leverage the tensor structure of the output data, we formulate the problem as the minimization of a least squares criterion subject to a multilinear rank constraint on the regression tensor. The rank constraint enforces the model to capture low-rank structure in the outputs and to explain dependencies between inputs and outputs in a low-dimensional multilinear subspace."
-% \end{itemize}
+Tensor regression models have been proposed to leverage the structure inherent in tensor-valued data. For instance, \textcite{HaoEtAl2021,ZhouLiZhu2013} focus on tensor covariates, while \textcite{RabusseauKadri2016,LiZhang2017,ZhouEtAl2023} focus on tensor responses, and \textcite{Hoff2015,Lock2018} consider tensor on tensor regression. \textcite{HaoEtAl2021} model a scalar response as a flexible nonparametric function of tensor covariates. \textcite{ZhouLiZhu2013} assume the scalar response has a distribution in the exponential family given the tensor-valued predictors with the link modeled as a multilinear function of the predictors. \textcite{RabusseauKadri2016} model the tensor-valued response as a linear model with tensor-valued regression coefficients subject to a multilinear rank constraint. \textcite{LiZhang2017} approach the problem with a similar linear model but, instead of a low-rank constraint, the error term is assumed to have a separable Kronecker product structure while using a generalization of the envelope model \parencite{CookLiChiaromonte2010}. \textcite{ZhouEtAl2023} focus on partially observed tensor response given vector-valued predictors with mode-wise sparsity constraints in the regression coefficients. \textcite{Hoff2015} extends an existing bilinear regression model to a tensor on tensor regression model of conformable modes and dimensions based on a Tucker product. \textcite{Lock2018} uses a tensor contraction to build a penalized least squares model for a tensor with an arbitrary number of modes and dimensions.
-% - RabusseauKadri2016 Y | x for tensor Y with vector x (HOLRR; Higher Order Low-Rank Regression)
-% - LiZhang2017 Y | x for tensoe Y with vector x (envelope model)
-% - ZhouEtAl2023 Y | x for tensor Y with vector x (sparse and partially observed)
+Our approach considers the general regression problem of fitting a response of general form (univariate, multivariate, tensor-valued) on a tensor-valued predictor. We operate in the context of sufficient dimension reduction \parencite[e.g.][]{Cook1998,Li2018} based on inverse regression, which leads us to regressing the tensor valued predictor on the response.
In our setting, this necessitates transforming the response to tensor-valued functions, regardless of whether it is itself tensor valued. Because of the setting, our method shares commonalities with the tensor regression models referred to above, yet the modeling and methodology are novel. Specifically, our tensor-to-tensor regression model is a generalized multi-linear model similar to the generalized linear model of \parencite{ZhouLiZhu2013}. To bypass the explosion of the number of parameters to estimate, we assume the inverse regression error covariance has Kronecker product structure as do \textcite{LiZhang2017}. Our maximum likelihood-based estimation does not require any penalty terms in contrast to the least squares and/or sparse approaches \parencite{ZhouLiZhu2013}. In the case of a tensor (multilinear) normal, given the tensor valued function of the response, our model exhibits similarities to the multilinear modeling of \textcite{Hoff2015}, but we use a generalized multilinear model and estimate the parameters with maximum likelihood instead of least squares. Moreover, a common issue in multilinear tensor regression models is the unidentifiability of the parameters, which we address in a completely different manner. For example, \textcite{LiZhang2017} developed theory that is based on orthogonal projection matrices to uniquely identify a subspace, while our approach is more general as it uses manifold theory. -% - ZhouLiZhu2013 y\in R (GLM) for y | Z, X for tensor X -% - HaoEtAl2021 y\in R for y | X for tensor X (sparse, element wise B-splines) +In this paper, we present a model-based \emph{Sufficient Dimension Reduction} (SDR) method for tensor-valued data with distribution in the quadratic exponential family assuming a separable Kronecker product structure of the first and second moment. The quadratic exponential family contains the multi-linear normal and the multi-linear Ising distributions, for continuous and binary tensor-valued random variables, respectively. By generalizing the parameter space to embedded manifolds we %obtain consistency and asymptotic normality results while +allow great modeling flexibility in the sufficient dimension reduction. -% Tensor regression models have been proposed to exploit the special structure of tensor covariates, e.g. \cite{HaoEtAl2021,ZhouLiZhu2013}, or tensor responses \cite{RabusseauKadri2016,LiZhang2017,ZhouEtAl2023} \cite{HaoEtAl2021} modeled a scalar response as a flexible nonparametric function of tensor covariates. \cite{ZhouLiZhu2013} assume the scalar response has a distribution in the exponential family given the tensor-valued predictors and model the link function as a multilinear function of the predictors. \cite{LiZhang2017} model the tensor-valued response as tensor normal. Rather than using $L_1$ type penalty functions to induce sparsity, they employ the envelope method (Cook, Li, and Chiaromonte Citation2010) to estimate the unknown regression coefficient. Moreover, the envelope method essentially identifies and uses the material information jointly. They develop an estimation algorithm and study the asymptotic properties of the estimator. the scalar response as These models try to utilize the sparse and low-rank structures in the tensors -- either in the regression coefficient tensor or the response tensor -- to boost performance on the regression task by reducing the number of free parameters. - -Tensor regression models have been proposed to leverage the structure inherent in tensor valued data. 
For instance, \textcite{HaoEtAl2021,ZhouLiZhu2013} focus on tensor covariates, while \textcite{RabusseauKadri2016,LiZhang2017,ZhouLiZhu2013} focus on tensor responses, and \textcite{Hoff2015,Lock2018} consider tensor on tensor regression. \textcite{HaoEtAl2021} modeled a scalar response as a flexible nonparametric function of tensor covariates. \textcite{ZhouLiZhu2013} assume the scalar response has a distribution in the exponential family given the tensor-valued predictors with the link modeled as a multilinear function of the predictors. \textcite{RabusseauKadri2016} model the tensor-valued response as a linear model with tensor valued regression coefficients subject to a multilinear rank constraint. \textcite{LiZhang2017} approach the problem with a similar linear model but instead of a low rank constraint the error term is assumed to have a separable Kronecker product structure while using a generalization of the envelope model \parencite{CookLiChiaromonte2010}. \textcite{ZhouEtAl2023} focus on partially observed tensor response given vector-valued predictors with mode-wise sparsity constraints in the regression coefficients. \textcite{Hoff2015} extends an existing bilinear regression model to a tensor on tensor of conformable modes and dimensions regression model based on a Tucker product. \textcite{Lock2018} uses a tensor contraction to build a penalized least squares model for a tensor with arbitrary number of modes and dimensions. - -Our approach considers the general regression problem of fitting a response of general form (univariate, multivariate, tensor-valued) on a tensor-value predictor. We operate in the context of sufficient dimension reduction \parencite[e.g.]{Cook1998,Li2018} based on inverse regression, which leads us to regressing the tensor-valued predictor on the response. In our setting, this necessitates transforming the response to tensor-valued functions, regardless of whether it is itself tensor-valued. Because of the setting, our method shares commonalities with the tensor regression models referred to above, yet the modeling and methodology are novel. -Specifically, our tensor-to-tensor regression model is a generalized multi-linear model similar to the generalized linear model of \parencite{ZhouLiZhu2013}. % but with tensor valued response by applying (a known) tensor valued function to the response in an inverse regression setting, reversing the role of response and predictors. -To bypass the explosion of the number of parameters to estimate, we assume the inverse regression error covariance has Kronecker product structure as do \textcite{LiZhang2017}. Our maximum likelihood-based estimation does not require any penalty terms in contrast to the least squares and/or sparse approaches \parencite{ZhouLiZhu2013}. In the case of a tensor (multilinear) normal, given the tensor-valued function of the response, our model exhibits similarities to the multilinear modeling of \textcite{Hoff2015}, but we use a generalized multilinear model and estimate the parameters with maximum likelihood instead of least squares. Moreover, a common issue in multilinear tensor regression models is the unidentifiability of the parameters, which we address in a completely different manner. For example, \textcite{LiZhang2017} develop theory that is based on orthogonal projection matrices to uniquely identify a subspace, while our approach is more general as it uses manifold theory. 
-
-
-In this paper, we present a model-based \emph{Sufficient Dimension Reduction} (SDR) method for tensor-valued data with distribution in the quadratic exponential family assuming a separable Kronecker product structure of the first and second moment. By generalizing the parameter space to embedded manifolds we obtain consistency and asymptotic normality results while allowing great modeling flexibility in the linear sufficient dimension reduction.
-The quadratic exponential family contains the tensor normal and the tensor Ising distributions, for continuous and binary tensor-valued random variables, respectively. Multi-linear normal models have been used in various applications, including medical imaging \parencite{BasserPajevic2007,DrydenEtAl2009}, spatio-temporal data analysis \parencite{GreenewaldHero2014}, regression analysis for longitudinal relational data \parencite{Hoff2015}. One of the most important uses of the multi-linear normal (MLN) distribution, and hence tensor analysis, is perhaps in magnetic resonance imaging (MRI) \parencite{OhlsonEtAl2013}. A recent survey \parencite{WangEtAl2022} and references therein contain more information and potential applications of multilinear tensor normal models.
-The Ising\footnote{Also known as the \emph{Lenz-Ising} model as the physical assumptions of the model where developed by both Lenz and Ising \parencite{Niss2005} where Ising gave a closed form solution for the 1D lattice \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magnetic dipoles (atomic ``spins'' with values $\pm 1$) under an external magnetic field (first moments) while allowing two-way interactions (second moments) between direct neighbours on a lattice, a discrete grid. The Ising problem, as known in statistical physics, is to compute observables such as the magnetizations and correlations under the Boltzmann distribution\footnote{The Boltzmann distribution is a probability distribution over the states of a physical system in thermal equilibrium (constant temperature) that assigns higher probabilities to states with lower energy.} while the interaction structure and the magnetic fields are given. The ``reverse'' problem, where the couplings and fields are unknown and to be determined from observations of the spins, as in statistical inference, is known as the \emph{inverse Ising problem} \parencite{NguyenEtAl2017}. From this point of view, the Ising model is a member of a discrete quadratic exponential family \parencite{CoxWermuth1994,JohnsonEtAl1997} for multivariate binary outcomes where the interaction structure (non-zero correlations) is determined by the lattice. Generally, neither the values of couplings nor the interaction structure are known.
+The Ising\footnote{Also known as the \emph{Lenz-Ising} model as the physical assumptions of the model were developed by both Lenz and Ising \parencite{Niss2005}; Ising gave a closed form solution for the 1D lattice \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magnetic dipoles (atomic ``spins'' with values $\pm 1$) under an external magnetic field (first moments) while allowing two-way interactions (second moments) between direct neighbors on a lattice, a discrete grid. The Ising problem, as known in statistical physics, is to compute observables such as the magnetizations and correlations under the Boltzmann distribution\footnote{The Boltzmann distribution is a probability distribution over the states of a physical system in thermal equilibrium (constant temperature) that assigns higher probabilities to states with lower energy.} while the interaction structure and the magnetic fields are given. The ``reverse'' problem, where the couplings and fields are unknown and to be determined from observations of the spins, as in statistical inference, is known as the \emph{inverse Ising problem} \parencite{NguyenEtAl2017}. From this point of view, the Ising model is a member of a discrete quadratic exponential family \parencite{CoxWermuth1994,JohnsonEtAl1997} for multivariate binary outcomes where the interaction structure (non-zero correlations) is determined by the lattice. Generally, neither the values of couplings nor the interaction structure are known.
-In consequence, the Ising model is mostly used to model multivariate binary data in statistics. The states are ${0, 1}$ instead of $\pm 1$, and full interaction structure. It is related to a multitude of other models, among which the most prominent are: \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} that also accommodates higher-order interactions (three-way and higher), \emph{(restricted) Botlzmann machines} \parencite{Smolensky1986,Hinton2002,FischerIgel2012} that introduce additional hidden variables for learning binary distributions. Most of these models can be used both in supervised and unsupervised settings.
-Applications of the Ising model (and variations thereof) range from modeling neural firing patterns \parencite{SchneidmanEtAl2006}, gene expression data analysis \parencite{LezonEtAl2006}, and modeling financial markets \parencite{Bury2013}. See also \textcite{NguyenEtAl2017}.
+In consequence, the Ising model is mostly used to model multivariate binary data in statistics. The states are $\{0, 1\}$ instead of $\pm 1$, and a full interaction structure is allowed. It is related to a multitude of other models, among which the most prominent are: \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} that also accommodates higher-order interactions (three-way and higher), and \emph{(restricted) Boltzmann machines} \parencite{Smolensky1986,Hinton2002,FischerIgel2012} that introduce additional hidden variables for learning binary distributions. Most of these models can be used both in supervised and unsupervised settings. Applications of the Ising model (and variations thereof) include modeling neural firing patterns \parencite{SchneidmanEtAl2006}, gene expression data analysis \parencite{LezonEtAl2006}, and modeling financial markets \parencite{Bury2013}. See also \textcite{NguyenEtAl2017}.
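+For concreteness, in the $\pm 1$ parameterization used in statistical physics, the Boltzmann distribution referred to above can be written, in generic notation (the symbols $h_j$, $J_{j l}$ and the normalization $Z$ serve only as an illustration, with the inverse temperature absorbed into the parameters), as
+\begin{displaymath}
+  P(\mat{s}) = \frac{1}{Z}\exp\Bigl(\sum_{j} h_j s_j + \sum_{j < l} J_{j l}\, s_j s_l\Bigr), \qquad \mat{s}\in\{-1, 1\}^p,
+\end{displaymath}
+where the external field components $h_j$ drive the first moments and the pairwise couplings $J_{j l}$ the second moments; the forward Ising problem treats $h_j$ and $J_{j l}$ as given, while the inverse Ising problem estimates them from observed spin configurations.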
-The $r$-tensor Ising model in statistical physics is a generalization of the Ising model to $r$-order interactions. \textcite{MukherjeeEtAl2020} study the one-parameter discrete exponential family for modeling dependent binary data where the interaction structure is given. In \textcite{LiuEtAl2023} the tensor structure itself is to be inferred. These models are fundamentally different from our approach where we rely on properties of the quadratic exponential family which models up to second-order interactions. Another important difference is that we adopt the multi-linear formulation as it is inherently linked to the observable structure of multi-way data as opposed to describing the model coefficients with an $r$-order tensor structure. +The $r$-tensor Ising model in statistical physics is a generalization of the Ising model to $r$-order interactions. \textcite{MukherjeeEtAl2022} study the one-parameter discrete exponential family for modeling dependent binary data where the interaction structure is given. In \textcite{LiuEtAl2023} the tensor structure itself is to be inferred. These models are fundamentally different from our approach where we rely on properties of the quadratic exponential family which models up to second-order interactions. Another important difference is that we adopt the multi-linear formulation as it is inherently linked to the observable structure of multi-way data as opposed to describing the model coefficients with an $r$-order tensor structure. -% \textcite{LiuEtAl2023,MukherjeeEtAl2020,ChengEtAl2014,Habeck2014} +Our main contributions are (a) formulating the dimension reduction problem via the quadratic exponential family, which allows us to derive the sufficient dimension reduction in closed form, (b) defining the parameter space as an embedded manifold, which provides great flexibility in modeling, (c) deriving the maximum likelihood estimator of the sufficient reduction subject to multi-linear constraints and overcoming parameter non-identifiability, (d) developing estimation algorithms which in the case of multi-linear normal predictors is fast and efficient, and (e) establishing the consistency and asymptotic normality of the estimators. -% The Ising model for multivariate binary outcomes belongs to the class of discrete exponential families. Its defining feature is that the sufficient statistic involves a quadratic term to capture correlations arising from pairwise interactions. -% The tensor Ising model is a higher-order Ising model for tensor-valued binary outcomes. -% %From \cite{MukherjeeEtAl2020} -% Higher-order Ising models arise naturally in the study of multi-atom interactions in lattice gas models, such as the square-lattice eight-vertex model, the Ashkin-Teller model, and Suzuki's pseudo-3D anisotropic model (cf. [6, 33, 36, 37, 49, 55, 56, 61, 62] and the references therein). More recently, higher-order spin systems have also been proposed for modeling peer-group effects in social networks [22]. \efi{Daniel: comment on what these guys do and contrast with your setting} In our approach, the parameter is not constrained to be scalar -% We derive maximum likelihood estimates for all first and second order interactions and propose a gradient-based optimization algorithm. +Even though our motivation is rooted in the SDR perspective, our proposal concerns inference on any regression model with a tensor valued response and predictors of any type. 
Thus, our approach can be used as a stand-alone model for such data regardless of whether one is interested in deriving sufficient reductions and/or reducing the dimension of the data. Our results in the framework of the quadratic exponential family for tensor valued variables, i.e., consistency and asymptotic normality, apply to both multi-linear normal \parencite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} and multi-linear Ising models, as defined in \cref{sec:ising_estimation}.
-As an aside, even though our motivation stems from the SDR perspective, our proposal concerns inference on any regression model with a tensor-valued response and any type of predictors. Thus, our approach can be used as a stand-alone model for such data regardless of whether one is interested in deriving sufficient reductions and/or reducing the dimension of the data. Our results in the framework of the quadratic exponential family for tensor-valued variables; i.e., consistency and asymptotic normality, apply to both multi-linear normal \efi{\ref{?} and multi-linear Ising models, as defined in this paper in Sec. ??.}
+The structure of this paper is as follows. We introduce our notation in \cref{sec:notation}. \Cref{sec:problem-formulation} details the problem we consider, and in \cref{sec:gmlm-model} we introduce our model. Continuing in \cref{sec:ml-estimation} we provide the basis for a general maximum likelihood estimation procedure and derive specialized methods for the multi-linear normal as well as the multi-linear Ising distributions. \Cref{sec:manifolds} gives a short introduction to manifolds and provides the basis for applying the consistency and asymptotic normality results from \cref{sec:asymtotics}. Simulations for continuous and binary predictors are the subject of \cref{sec:simulations}. Finally, in \cref{sec:data-analysis} we apply our model to EEG data and perform a proof-of-concept data analysis where a chess board is interpreted as a collection of binary $8\times 8$ matrices.
-
-The structure of this paper is as follows. We introduce our notation in \cref{sec:notation}. \Cref{sec:problem-formulation} details the problem we consider and in \cref{sec:gmlm-model} we introduce our model. Continuing in \cref{sec:ml-estimation} we provide the basis for a general maximum likelihood estimation procedure and derive specialized methods for tensor normal as well as the tensor Ising distributions. \Cref{sec:manifolds} gives a short introduction into manifolds and provides the basis for applying the consistency and asymtotic normality results from \cref{sec:asymtotics}. Simulations for continuous and binary predictors are the subject of \cref{sec:simulations}. Finally, in \cref{sec:data-analysis} we apply our model to EEG data and perform a prove of concept data analysis example where a chess board is interpreted as a collection of binary $8\times 8$ matrices.
-
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Notation}\label{sec:notation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-Let $\ten{A}\in\mathbb{R}^{q_1\times \ldots\times q_r}$ denotes an order\footnote{Also referred to as rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the concept of rank of a matrix.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes (dimensions) of $\ten{A}$ and $\ten{A}_{i_1,...,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ has two modes, the rows and columns. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$, $k\in[r] = \{1, 2, \ldots, r\}$, the \emph{multi-linear multiplication}, or \emph{Tucker operator} \parencite{Kolda2006}, is defined element wise as
+Let $\ten{A}\in\mathbb{R}^{q_1\times \ldots\times q_r}$ denote an order\footnote{Also referred to as rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the concept of rank of a matrix.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes (dimensions) of $\ten{A}$ and $\ten{A}_{i_1,...,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ has two modes, rows and columns. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$, $k\in[r] = \{1, 2, \ldots, r\}$, the \emph{multi-linear multiplication}, or \emph{Tucker operator} \parencite{Kolda2006}, is defined element-wise as
\begin{displaymath}
(\ten{A}\times\{\mat{B}_1, \ldots, \mat{B}_r\})_{j_1, \ldots, j_r} = \sum_{i_1, \ldots, i_r = 1}^{q_1, \ldots, q_r} \ten{A}_{i_1, \ldots, i_r}(\mat{B}_{1})_{j_1, i_1} \cdots (\mat{B}_{r})_{j_r, i_r}
\end{displaymath}
@@ -345,25 +319,24 @@ which results in an order $r$ tensor of dimension $p_1\times ...\times p_k$. The
\begin{displaymath}
\ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{q_1}, \ldots, \mat{I}_{q_{k-1}}, \mat{B}_{k}, \mat{I}_{q_{k+1}}, \ldots, \mat{I}_{q_r}\}.
\end{displaymath}
-The notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is short hand for the iterative application of the mode product for all indices in $S\subseteq[r]$. For example $\ten{A}\mlm_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous because the mode product commutes for different modes; i.e., $\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$ for $j\neq k$. For example, let $\mat{A}, \mat{B}_1, \mat{B}_2$ be matrices (of matching dimensions). The bilinear mode-product and multi-linear multiplication relate to the well known matrix-matrix multiplications as
+The notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is shorthand for the iterative application of the mode product for all indices in $S\subseteq[r]$. For example, $\ten{A}\mlm_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous because the mode product commutes for different modes; i.e., $\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$ for $j\neq k$. For example, let $\mat{A}, \mat{B}_1, \mat{B}_2$ be matrices (of matching dimensions). The bilinear mode-product and multi-linear multiplication relate to the well-known matrix-matrix multiplications as
\begin{displaymath}
\mat{A}\times_1\mat{B}_1 = \mat{B}_1\mat{A}, \qquad \mat{A}\times_2\mat{B}_2 = \mat{A}\t{\mat{B}_2}, \qquad \mat{A}\mlm_{k = 1}^2\mat{B}_k = \mat{A}\mlm_{k \in \{1, 2\}}\mat{B}_k = \mat{B}_1\mat{A}\t{\mat{B}_2}.
\end{displaymath}
-%Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$. The operator $\vec$ maps an array to a vector. Specifically, $\vec(\mat{B})$ stands for the $pq \times 1$ vector of the $p \times q$ matrix $\mat{B}$ resulting from stacking the columns of $\mat{B}$ one after the other.
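+As a small numerical illustration of these identities (the particular matrices below are arbitrary and serve only as an example), let
+\begin{displaymath}
+  \mat{A} = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}, \qquad
+  \mat{B}_1 = \begin{pmatrix} 1 & 0 \\ 1 & 1 \end{pmatrix}, \qquad
+  \mat{B}_2 = \begin{pmatrix} 1 & 1 \\ 0 & 1 \end{pmatrix},
+\end{displaymath}
+for which
+\begin{displaymath}
+  \mat{A}\times_1\mat{B}_1 = \mat{B}_1\mat{A} = \begin{pmatrix} 1 & 2 \\ 4 & 6 \end{pmatrix}, \qquad
+  \mat{A}\times_2\mat{B}_2 = \mat{A}\t{\mat{B}_2} = \begin{pmatrix} 3 & 2 \\ 7 & 4 \end{pmatrix}, \qquad
+  \mat{A}\mlm_{k = 1}^{2}\mat{B}_k = \mat{B}_1\mat{A}\t{\mat{B}_2} = \begin{pmatrix} 3 & 2 \\ 10 & 6 \end{pmatrix}.
+\end{displaymath}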
For a tensor $\ten{A}$ of order $r$ and dimensions $q_1, \ldots, q_r$, $\vec(\ten{A})$ is the $q_1 q_2 \ldots q_r \times 1$ vector with the elements of $\ten{A}$ stacked one after the other in the order $r$ then $r-1$, and so on. For example, if $\ten{A}$ is a 3-dimensional array, $\vec(\ten{A})=\t{(\t{\vec(\ten{A}_{:,:,1})},\t{\vec(\ten{A}_{:,:,2})},\ldots,\t{\vec(\ten{A}_{:,:,q_3})})}$. We use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec(\ten{A}) = \vec(\ten{B})$. The \emph{inner product} between two tensors of the same order and dimensions is \begin{displaymath} \langle\ten{A}, \ten{B}\rangle = \sum_{i_1, \ldots, i_r} \ten{A}_{i_1, \ldots, i_r}\ten{B}_{i_1, \ldots, i_r} \end{displaymath} -This leads to the definition of the \emph{Frobenius norm} for tensors, $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$ and is the straightforward extension of the Frobenius norm for matrices and vectors. The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$, such that +This leads to the definition of the \emph{Frobenius norm} for tensors, $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$ and is the straightforward extension of the Frobenius norm for matrices and vectors. The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$, such that \begin{displaymath} \ten{A}\circ\ten{B} \equiv (\vec\ten{A})\t{(\vec{\ten{B}})}. \end{displaymath} -Let $\ten{K} : \mathbb{R}^{q_1, ..., q_{2 r}}\to\mathbb{R}^{q_1 q_{r + 1}, ..., q_r q_{2 r}}$ be defined element wise with indices $1\leq i_j + 1\leq q_j q_{r + j}$ for $j = 1, ..., r$ as +Let $\ten{K} : \mathbb{R}^{q_1, ..., q_{2 r}}\to\mathbb{R}^{q_1 q_{r + 1}, ..., q_r q_{2 r}}$ be defined element-wise with indices $1\leq i_j + 1\leq q_j q_{r + j}$ for $j = 1, ..., r$ as \begin{align*} \ten{K}(\ten{A})_{i_1 + 1, ..., i_r + 1} &= \ten{A}_{\lfloor i_1 / q_{r + 1}\rfloor + 1, ..., \lfloor i_r / q_{2 r} \rfloor + 1, (i_1\operatorname{mod}q_{r + 1}) + 1, ..., (i_r\operatorname{mod}q_{2 r}) + 1} \end{align*} @@ -392,7 +365,7 @@ where the vectorized quantities $\vec{\ten{X}}\in\mathbb{R}^p$ and $\vec\ten{F}( %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Problem Formulation}\label{sec:problem-formulation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -Our goal is to infer the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times ... \times p_r$ with continuous or discrete entries, and the response $Y$ is unconstrained. The predictor $\ten{X}$ is a complex object; to simplify the problem we assume there exists a tensor-valued function of lower dimension $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that +Our goal is to infer the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times ... \times p_r$ with continuous or discrete entries, and the response $Y$ is unconstrained. 
The predictor $\ten{X}$ is a complex object; to simplify the problem we assume there exists a tensor valued function of lower dimension $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that \begin{displaymath} F(Y\mid \ten{X}) = F(Y\mid \ten{R}(\ten{X})). \end{displaymath} @@ -412,13 +385,13 @@ f_{\mat{\eta}_y}(\ten{X}\mid Y = y) &= h(\ten{X})\exp(\t{\mat{\eta}_y}\mat{t}(\ten{X}) - b(\mat{\eta}_y)) \nonumber \\ &= h(\ten{X})\exp(\langle \mat{t}_1(\ten{X}), \mat{\eta}_{1y} \rangle + \langle \mat{t}_2(\ten{X}), \mat{\eta}_{2y} \rangle - b(\mat{\eta}_{y})) \label{eq:quad-density} \end{align} -where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{tensor normal} (\cref{sec:tensor-normal-estimation}) and \emph{tensor Ising model} (\cref{sec:ising_estimation}, a generalization of the (inverse) Ising model which is a multi-variate Bernoulli with up to second order interactions) and mixtures of these two. +where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{multi-linear normal} (\cref{sec:tensor-normal-estimation}) and \emph{multi-linear Ising model} (\cref{sec:ising_estimation}, a generalization of the (inverse) Ising model which is a multi-variate Bernoulli with up to second order interactions) and mixtures of these two. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{The Generalized Multi-Linear Model}\label{sec:gmlm-model} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In model \eqref{eq:quad-density}, the dependence of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}-parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, thus it is not a parameter in the formal statistical sense. It is considered as a parameter when using the equivalence in \eqref{eq:inverse-regression-sdr} and view $Y$ as a parameter as a device to derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with +In model \eqref{eq:quad-density}, the dependence of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}-parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, so it is not a parameter in the formal statistical sense. 
We view it as a parameter in order to leverage \eqref{eq:inverse-regression-sdr} and derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with \begin{align}\label{eq:t-stat} \mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}),\mat{t}_2(\ten{X}))=(\vec{\ten{X}}, \mat{T}_2\vech((\vec\ten{X})\t{(\vec\ten{X})})), \end{align} @@ -432,7 +405,7 @@ where $\mat{D}_p$ is the \emph{duplication matrix} from \textcite[Ch.~11]{Abadir \begin{equation}\label{eq:quadratic-exp-fam} f_{\eta_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp\left(\langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle - b(\mat{\eta}_y)\right) \end{equation} -The exponential family in \eqref{eq:quadratic-exp-fam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate, which is also the reason why we opted for the second order exponential family in our formulation. +The exponential family in \eqref{eq:quadratic-exp-fam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate, which is also the reason why we opted for the second-order exponential family in our formulation. By the equivalence in \eqref{eq:inverse-regression-sdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multi-linear formulation next. Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in the first parameter and let @@ -440,7 +413,7 @@ Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F} \mat{\eta}_{1y} &= \vec{\overline{\ten{\eta}}} + \mat{B}\vec\ten{F}_y, \label{eq:eta1-manifold} \\ \mat{\eta}_{2} &= \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec(c\,\mat{\Omega}), \label{eq:eta2-manifold} \end{align} -where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1-manifold} becomes +where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. 
To relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable. This, in turn, leads to imposing a corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1-manifold} becomes \begin{align} \mat{\eta}_{1y} &= \vec\biggl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\biggr), \label{eq:eta1} @@ -489,11 +462,9 @@ The reduction in vectorized form is $\vec\ten{R}(\ten{X})=\t{\mat{B}}\vec(\ten{X \begin{align*} f_{\theta}(\mat{x}\mid Y = y) &= h(\mat{x})\exp(\langle\mat{x}, \mat{\eta}_{1y}(\mat{\theta})\rangle + \langle\vec(\mat{x}\circ\mat{x}), \mat{\eta}_2(\mat{\theta})\rangle - b(\mat{\eta}_y(\mat{\theta}))) \\ - % &= h(\mat{x})\exp(\t{\mat{\eta}_{1y}(\theta)}\mat{x} + \t{\vec(\mat{x}\circ\mat{x})}\mat{\eta}_2(\mat{\theta}) - b(\mat{\eta}_y(\mat{\theta}))) \\ &= h(\mat{x})\exp(\t{(\overline{\mat{\eta}} + \mat{\beta}\mat{f}_y)}\mat{x} + c\,\t{\mat{x}}\mat{\Omega}\,\mat{x} - b(\mat{\eta}_y(\mat{\theta}))). \end{align*} using the relation of $\mat{\theta}$ to the natural parameters given by $\mat{\eta}_{1y}(\mat{\theta}) = \overline{\mat{\eta}} + \mat{\beta}\mat{f}_y$ and $\mat{\eta}_2(\theta) = c\,\mat{\Omega}$. - % where the number of unknown parameters is $p + \dim(\StiefelNonCompact{p}{q}) + \dim(\SymPosDefMat{p}) = p\frac{p + 2 q + 3}{2}$. \end{example} \begin{example}[Matrix valued $\mat{X}$ ($r = 2$)] @@ -518,7 +489,7 @@ The maximum likelihood estimate of $\mat{\theta}_0$ is the solution to the optim \end{equation} with $\hat{\mat{\theta}}_n = (\vec\widehat{\overline{\ten{\eta}}}, \vec\widehat{\mat{B}}, \vech\widetilde{\mat{\Omega}})$ where $\widehat{\mat{B}} = \bigkron_{k = r}^{1}\widehat{\mat{\beta}}_k$ and $\widehat{\mat{\Omega}} = \bigkron_{k = r}^{1}\widehat{\mat{\Omega}}_k$. -A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}. +A straightforward and general method for parameter estimation is \emph{gradient descent}. To apply gradient-based optimization, we compute the gradients of $l_n$ in \cref{thm:grad}. \begin{theorem}\label{thm:grad} For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood is of the form in \eqref{eq:log-likelihood} with $\mat{\theta}$ being the collection of all GMLM parameters $\overline{\ten{\eta}}$, ${\mat{B}} = \bigkron_{k = r}^{1}{\mat{\beta}}_k$ and ${\mat{\Omega}} = \bigkron_{k = r}^{1}{\mat{\Omega}}_k$ for $k = 1, ..., r$. Let $\ten{G}_2(\mat{\eta}_y)$ be a tensor of dimensions $p_1, \ldots, p_r$ such that @@ -535,42 +506,31 @@ A straightforward and general method for parameter estimation is \emph{gradient If $\mat{T}_2$ is the identity matrix $\mat{I}_{p(p + 1) / 2}$, then $\ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y)$. \end{theorem} -Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In \cref{thm:grad}, $\mat{T}_2$ can be used to introduce flexible second moment structures. For example, it allows modeling effects differently for predictor components, as described in \cref{sec:ising_estimation} after Eqn. \eqref{eq:ising-cond-prob}. In the remainder, we focus on $\mat{T}_2$'s that are identity matrices. 
This approach simplifies the estimation algorithm and the speed of the numerical calculation in the case of tensor normal predictors. % In the case of the tensor normal distribution, -An iterative cyclic updating scheme is derived in \cref{sec:tensor-normal-estimation}, which has much faster convergence, is stable and does not require hyperparameters, as will be discussed later. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}. +Although the general case of any GMLM model can be fitted via gradient descent using \cref{thm:grad}, this may be very inefficient. In \cref{thm:grad}, $\mat{T}_2$ can be used to introduce flexible second moment structures. For example, it allows modeling effects differently for predictor components, as described in \cref{sec:ising_estimation} after Eqn. \eqref{eq:ising-cond-prob}. In the remainder, we focus on $\mat{T}_2$'s that are identity matrices. This approach simplifies the estimation algorithm and the speed of the numerical calculation in the case of multi-linear normal predictors. An iterative cyclic updating scheme is derived in \cref{sec:tensor-normal-estimation}, which has much faster convergence, is stable, and does not require hyperparameters, as will be discussed later. On the other hand, the Ising model does not allow such a scheme. There we need to use a gradient-based method, which is the subject of \cref{sec:ising_estimation}. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Tensor Normal}\label{sec:tensor-normal-estimation} +\subsection{Multi-Linear Normal}\label{sec:tensor-normal-estimation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The \emph{multi-linear normal} is the extension of the matrix normal to tensor-valued random variables and a member of the quadratic exponential family \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta2}. \textcite{Dawid1981,Arnold1981} introduced the term matrix normal and, in particular, \textcite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution; a distribution of a two-way array, each component -representing a vector of observations \parencite{OhlsonEtAl2013}. \textcite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} presented the extension of the bilinear to the multilinear normal distribution, what we call tensor normal, using a parallel extension of bilinear matrices to multilinear tensors \parencite{Comon2009}. +The \emph{multi-linear normal} is the extension of the matrix normal to tensor valued random variables and a member of the quadratic exponential family \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta2}. \textcite{Dawid1981,Arnold1981} introduced the term matrix normal and, in particular, \textcite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution; a distribution of a two-way array, each component +representing a vector of observations \parencite{OhlsonEtAl2013}. \textcite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} presented the extension of the bilinear to the multilinear normal distribution, using a parallel extension of bilinear matrices to multi-linear tensors \parencite{Comon2009}. 
-The defining feature of the matrix normal distribution, and its tensor extension, is the Kronecker product structure of its covariance. This formulation, where the covariates are multivariate normal with multiway covariance structure modeled as a Kronecker product of matrices of much lower dimension, aims to overcome the significant modeling and computational challenges arising from the high computational complexity of manipulating tensor representations \parencite[see, e.g.,][]{HillarLim2013,WangEtAl2022}. +The defining feature of the matrix normal distribution, and its multi-linear extension, is the Kronecker product structure of its covariance. This formulation, where the covariates are multivariate normal with multiway covariance structure modeled as a Kronecker product of matrices of much lower dimension, aims to overcome the significant modeling and computational challenges arising from the high computational complexity of manipulating tensor representations \parencite[see, e.g.,][]{HillarLim2013,WangEtAl2022}. -Multilinear tensor normal %Kronecker-separable covariance -models have been used in various applications, including -medical imaging \parencite{BasserPajevic2007,DrydenEtAl2009}, spatio-temporal data analysis \parencite{GreenewaldHero2014}, regression analysis -for longitudinal relational data \parencite{Hoff2015}. -%, radar [AFJ10], and multiple-input-multiple-output (MIMO) communications [WJS08]. -One of the most important uses of the multilinear normal (MLN) distribution, and hence tensor analysis, is perhaps in magnetic resonance imaging (MRI) \parencite{OhlsonEtAl2013}. -A recent survey \parencite{WangEtAl2022} and references therein contain more information and potential applications of multilinear tensor normal models. -%The first occurrence of the \textit{matrix normal} we found, even though not explicitly called as such, was in \textcite{SrivastavaKhatri1979}. - -Suppose $\ten{X}\mid Y = y$ follows a tensor normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is non-degenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is given by +Suppose $\ten{X}\mid Y = y$ follows a multi-linear normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is non-degenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is given by \begin{displaymath} f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right). \end{displaymath} -For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam}, determines the scaling constant $c = -1/2$. The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ is +For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$. 
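+To see this, expand the exponent and collect the terms that are linear and quadratic in $\ten{X}$:
+\begin{displaymath}
+  -\frac{1}{2}\Bigl\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
+  = \Bigl\langle\ten{X}, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
+  - \frac{1}{2}\Bigl\langle\ten{X}, \ten{X}\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle
+  - \frac{1}{2}\Bigl\langle\ten{\mu}_y, \ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\Bigr\rangle,
+\end{displaymath}
+where the first term is linear in $\vec\ten{X}$ with coefficient $\vec\bigl(\ten{\mu}_y\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1}\bigr)$, the second term is the quadratic term with $c\,\mat{\Omega} = -\frac{1}{2}\bigkron_{k = r}^{1}\mat{\Sigma}_k^{-1}$, and the last term does not depend on $\ten{X}$ and is absorbed into $b(\mat{\eta}_y)$.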
The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$ is \begin{equation}\label{eq:tnormal_cond_params} \ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}, \end{equation} -where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This lead to another simplification since then $\mat{T}_2$ in \eqref{eq:t-stat} equals the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler. We obtain +where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to another simplification since then $\mat{T}_2$ in \eqref{eq:t-stat} equals the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler. We obtain \begin{displaymath} \ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad \ten{G}_2(\mat{\eta}_y) = \ten{g}_2(\mat{\eta}_y) = \E[\ten{X}\circ\ten{X}\mid Y = y] \equiv \bigkron_{k = r}^1\mat{\Sigma}_k + (\vec{\ten{\mu}}_y)\t{(\vec{\ten{\mu}}_y)}. \end{displaymath} -In practice, we assume we have a random sample of $n$ observations $(\ten{X}_i, \ten{F}_{y_i})$ from the joint distribution. We start the estimation process by demeaning the data. Then, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. To solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ we initialize the parameters using a simple heuristic approach. % For initial estimates $\hat{\mat{\beta}}_k^{(0)}$ we -First, we compute moment based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ as +In practice, we assume we have a random sample of $n$ observations $(\ten{X}_i, \ten{F}_{y_i})$ from the joint distribution. We start the estimation process by demeaning the data. Then, only the reduction matrices $\mat{\beta}_k$ and the scatter matrices $\mat{\Omega}_k$ need to be estimated. To solve the optimization problem \eqref{eq:mle}, with $\overline{\ten{\eta}} = 0$ we initialize the parameters using a simple heuristic approach. First, we compute moment based mode-wise marginal covariance estimates $\widehat{\mat{\Sigma}}_k(\ten{X})$ and $\widehat{\mat{\Sigma}}_k(\ten{F}_Y)$ as \begin{displaymath} \widehat{\mat{\Sigma}}_k(\ten{X}) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}, \qquad \widehat{\mat{\Sigma}}_k(\ten{F}_Y) = \frac{1}{n}\sum_{i = 1}^{n} (\ten{F}_{y_i})_{(k)}\t{(\ten{F}_{y_i})_{(k)}}. @@ -601,7 +561,6 @@ Given $\hat{\mat{\beta}}_1, \ldots, \hat{\mat{\beta}}_r, \hat{\mat{\Omega}}_1, \biggr) \hat{\mat{\Omega}}_j. \end{equation} -%For the scatter matrices $\mat{\Omega}_j$, we need to fudge a bit. Equating the partial gradient of the $j$th scatter matrix $\mat{\Omega}_j$ in \cref{thm:grad} to zero ( $\nabla_{\mat{\Omega}_j}l_n = 0$) gives a quadratic matrix equation due to the dependence of $\ten{\mu}_y$ on $\mat{\Omega}_j$. 
In practice though, it is faster, more stable, and equally accurate to use mode-wise covariance estimates via the residuals \begin{displaymath} \hat{\ten{R}}_i = \ten{X}_i - \hat{\ten{\mu}}_{y_i} = \ten{X}_i - \ten{F}_{y_i}\mlm_{k = 1}^{r}\hat{\mat{\Omega}}_k^{-1}\hat{\mat{\beta}}_k. @@ -619,7 +578,7 @@ so that \tilde{s} = \biggl(\Bigl(\prod_{k = 1}^{r}\tr{\tilde{\mat{\Sigma}}_k}\Bigr)^{-1}\frac{1}{n}\sum_{i = 1}^n \langle \hat{\ten{R}}_i, \hat{\ten{R}}_i \rangle\biggr)^{1 / r} \end{displaymath} resulting in the estimates $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j)^{-1}$. -Estimation is then performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. % Convergence is very fast, experiments showed that convergence occures usualy in less than $10$ iterations. +Estimation is then performed by updating the estimates $\hat{\mat{\beta}}_j$ via \eqref{eq:tensor_normal_beta_solution} for $j = 1, \ldots, r$, and then recompute the $\hat{\mat{\Omega}}_j$ estimates simultaneously keeping the $\hat{\mat{\beta}}_j$'s fixed. This procedure is repeated until convergence. A technical detail for numerical stability is to ensure that the scaled values $\tilde{s}\tilde{\mat{\Sigma}}_j$, assumed to be symmetric and positive definite, are well conditioned. Thus, we estimate the condition number of $\tilde{s}\tilde{\mat{\Sigma}}_j$ before computing the inverse. In case of ill-conditioning, we use the regularized $\hat{\mat{\Omega}}_j = (\tilde{s}\tilde{\mat{\Sigma}}_j + 0.2 \lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)\mat{I}_{p_j})^{-1}$ instead, where $\lambda_{1}(\tilde{s}\tilde{\mat{\Sigma}}_j)$ is the first (maximum) eigenvalue. Experiments showed that this regularization is usually only required in the first few iterations. @@ -631,11 +590,11 @@ similar “flip-flop” approach by iteratively updating the $\mat{\beta}_k$'s a %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Ising Model}\label{sec:ising_estimation} +\subsection{Multi-Linear Ising Model}\label{sec:ising_estimation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The Ising\footnote{Also known as the \emph{Lenz-Ising} model as the physical assumptions of the model where developed by both Lenz and Ising \parencite{Niss2005}. Ising gave a closed form solution for the 1-dimensional lattice, that is, a linear chain \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magentic dipoles (atomic ``spins'') which can take two states ($\pm 1$) while allowing two-way interactions between direct neighbours on a lattice, a discrete grid. The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Nowadays, the Ising model, in its general form, allows for different coupling strength for every (symmetric) interaction as well as an external magnetic field acting on every magnetic dipole separately. A review is given by \textcite{NguyenEtAl2017}. 
+% The Ising\footnote{Also known as the \emph{Lenz-Ising} model as the physical assumptions of the model where developed by both Lenz and Ising \parencite{Niss2005}. Ising gave a closed form solution for the 1-dimensional lattice, that is, a linear chain \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magentic dipoles (atomic ``spins'') which can take two states ($\pm 1$) while allowing two-way interactions between direct neighbours on a lattice, a discrete grid. The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Nowadays, the Ising model, in its general form, allows for different coupling strength for every (symmetric) interaction as well as an external magnetic field acting on every magnetic dipole separately. A review is given by \textcite{NguyenEtAl2017}. -In statistics, the Ising model is used to model multivariate binary data. That is, the states are ${0, 1}$ instead of $\pm 1$. It is related to a multitude of other models; \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} considering also interactions (tree-way and higher), to give the most prominent. +% In statistics, the Ising model is used to model multivariate binary data. That is, the states are ${0, 1}$ instead of $\pm 1$. It is related to a multitude of other models; \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} considering also interactions (tree-way and higher), to give the most prominent. The $p$-dimensional Ising model is a discrete probability distribution on the set of $p$-dimensional binary vectors $\mat{x}\in\{0, 1\}^p$ with probability mass function (pmf) given by \begin{displaymath} @@ -652,34 +611,30 @@ Abusing notation, we let $\mat{\gamma}_{j l}$ denote the element of $\mat{\gamma \end{align} Conditional Ising models, incorporating the information of covariates $Y$ into the model, were considered by \textcite{ChengEtAl2014,BuraEtAl2022}. The direct way is to parameterize $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$. -We extend the conditional pmf by allowing the binary variables to be tensor valued; that is, we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$ for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$. The tensor structure of $\ten{X}$ is accommodated by assuming Kronecker product constraints to the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the tensor normal model. 
This means that we compare the pmf $P_{\mat{\gamma}_y}(\vec{\ten{X}} | Y = y)$ with the quadratic exponential family \eqref{eq:quadratic-exp-fam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$, which results in the GMLM being expressed as +We extend the conditional pmf by allowing the binary variables to be tensor valued; that is, we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$ for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$. The tensor structure of $\ten{X}$ is accommodated by assuming Kronecker product constraints to the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the multi-linear normal model. This means that we compare the pmf $P_{\mat{\gamma}_y}(\vec{\ten{X}} | Y = y)$ with the quadratic exponential family \eqref{eq:quadratic-exp-fam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$, which results in the GMLM being expressed as \begin{align} P_{\mat{\gamma}_y}(\ten{X} \mid Y = y) &= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \nonumber \\ &= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr)\label{eq:ising-cond-prob} \end{align} -where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This imposes an additional constraint to the model, the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of these approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardless, under our modeling choice, the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ is +where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This imposes an additional constraint to the model, the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self-interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self-interaction effect. 
All of these approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardless, under our modeling choice, the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ is \begin{equation}\label{eq:ising-natural-params} - % \t{\pinv{\mat{D}_p}}\mat{\gamma}_y - % = \vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y})) - % = \vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr). \mat{\gamma}_y = \t{\mat{D}_p}\vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y})) = \t{\mat{D}_p}\vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr). \end{equation} -In contract to the tensor normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specifically, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode wise reduction matrices $\mat{\beta}_k$ are elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choices are listed in \cref{sec:matrix-manifolds}. +In contrast to the multi-linear normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specifically, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode wise reduction matrices $\mat{\beta}_k$ are elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choices are listed in \cref{sec:matrix-manifolds}. To solve the optimization problem \eqref{eq:mle}, given a data set $(\ten{X}_i, y_i)$, $i = 1, \ldots, n$, we use a variation of gradient descent. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Initial Values} -The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$ is to use the tensor normal estimates from \cref{sec:tensor-normal-estimation} for $k = 1, \ldots, r$, considering $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. Setting everything to the uninformed initial value, that is $\mat{\Omega}_k = \mat{0}$ as this corresponds to the conditional log odds to be $1:1$ for every component and pairwise interaction. This is not possible, since $\mat{0}$ is a stationary point of the log-likelihood. This is directly observed by considering the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic which threads every mode seperately and ignores any relation to the covariates. It is computationaly cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$th mode second moment estimate be +The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$ is to use the multi-linear normal estimates from \cref{sec:tensor-normal-estimation} for $k = 1, \ldots, r$, considering $\ten{X}_i$ as continuous. 
For initial values of $\mat{\Omega}_k$, a different approach is required. The uninformed initial value would be $\mat{\Omega}_k = \mat{0}$, which corresponds to conditional log odds of $1:1$ for every component and pairwise interaction. This choice is not possible, though, since $\mat{0}$ is a stationary point of the log-likelihood. This is directly observed by considering the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic that treats every mode separately and ignores any relation to the covariates. It is computationally cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$th mode second moment estimate be
\begin{equation}\label{eq:ising-mode-moments}
	\hat{\mat{M}}_{2(k)} = \frac{p_k}{n p}\sum_{i = 1}^n (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}}
\end{equation}
-which contains the $k$th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. Considering every column of the matricized observation $(\ten{X}_i)_{(k)}$ as a $p_k$ dimensional observation. The number of those artificially generated observations is $n \prod_{j\neq k}p_j$. Let $Z_k$ denote the random variable those artificial observations are realization of. Then, we can interpret the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as the estimates of the probability $P((Z_k)_j = 1)$, that is the marginal probability of the $j$th element of $Z_k$ being $1$. Similar, for $l \neq j$ we have $(\hat{\mat{M}}_{2(k)})_{j l}$ estimating $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of two-way interactions. % Without any regard of accuracy ...
-Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Applying \eqref{eq:ising-two-way-log-odds} then gives the initial component-wise estimates $\hat{\mat{\Omega}}_k^{(0)}$,
+which contains the $k$th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. We consider every column of the matricized observation $(\ten{X}_i)_{(k)}$ as a $p_k$ dimensional observation, giving $n \prod_{j\neq k}p_j$ artificially generated observations. Let $Z_k$ denote the random variable those artificial observations are realizations of. Then, we can interpret the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as estimates of the probability $P((Z_k)_j = 1)$, that is, the marginal probability of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$, the element $(\hat{\mat{M}}_{2(k)})_{j l}$ estimates $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of a two-way interaction. Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off-diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively.
Applying \eqref{eq:ising-two-way-log-odds} then gives the initial component-wise estimates $\hat{\mat{\Omega}}_k^{(0)}$, \begin{equation}\label{eq:ising-init-Omegas} (\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0, \qquad @@ -696,36 +651,36 @@ The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural- \begin{displaymath} \mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}}, \end{displaymath} -which is iterated till convergence. In practice, iteration is performed until either a maximum number of iterations is exhausted and/or some break condition is satisfied. A proper choice of the learning rate is needed as a large learning rate $\lambda$ may cause instability, while a very low learning rate requires an enormous amount of iterations. Generically, there are two approaches to avoid the need to determine a proper learning rate. First, \emph{line search methods} determine an appropriate step size for every iteration. This works well if the evaluation of the object function (the log-likelihood) is cheap. This is not the case in our setting, see \cref{sec:ising-bigger-dim}. The second approach is an \emph{adaptive learning rate}, where one tracks specific statistics while optimizing and dynamically adapting the learning rate via well-tested heuristics using the gathered knowledge from past iterations. We opted to use an adaptive learning rate approach, which not only removes the need to determine an appropriate learning rate, but also accelerates learning. +which is iterated till convergence. In practice, iteration is performed until either a maximum number of iterations is exhausted and/or some break condition is satisfied. A proper choice of the learning rate is needed as a large learning rate $\lambda$ may cause instability, while a very low learning rate requires an enormous amount of iterations. Generically, there are two approaches to avoid the need to determine a proper learning rate. First, \emph{line search methods} determine an appropriate step size for every iteration. This works well if the evaluation of the object function (the log-likelihood) is cheap. This is not the case in our setting, see \cref{sec:ising-bigger-dim}. The second approach is an \emph{adaptive learning rate}, where one tracks specific statistics while optimizing and dynamically adapting the learning rate via well-tested heuristics using the gathered knowledge from past iterations. We opted to use an adaptive learning rate approach, which not only removes the need to determine an appropriate learning rate but also accelerates learning. -Our method of choice is \emph{root mean squared propagation} (RMSprop) \parencite{Hinton2012}. This is a well known method in machine learning for training neural networks. It is a variation of gradient descent with a per scalar parameter adaptive learning rate. It tracks a moving average of the element wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element wise) the gradient in the update rule. See \textcite{Hinton2012,GoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is +Our method of choice is \emph{root mean squared propagation} (RMSprop) \parencite{Hinton2012}. This is a well-known method in machine learning for training neural networks. It is a variation of gradient descent with a per scalar parameter adaptive learning rate. 
It tracks a moving average of the element-wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element-wise) the gradient in the update rule. See \textcite{Hinton2012,GoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is
\begin{align*}
	\mat{g}_2^{(I + 1)} &= \nu \mat{g}_2^{(I)} + (1 - \nu)\nabla l_n(\mat{\theta}^{(I)})\odot\nabla l_n(\mat{\theta}^{(I)}), \\
	\mat{\theta}^{(I + 1)} &= \mat{\theta}^{(I)} + \frac{\lambda}{\sqrt{\mat{g}_2^{(I + 1)}} + \epsilon}\odot\nabla l_n(\mat{\theta}^{(I)}).
\end{align*}
-The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed. The initial value of $\mat{g}_2^{(0)} = \mat{0}$, the symbol $\odot$ denotes the Hadamard product, or element wise multiplication. The division and square root operation are performed element wise as well. According to our experiments, RMSprop requires iterations in the range of $50$ till $1000$ till convergence while gradient ascent with a learning rate of $10^{-3}$ is in the range of $1000$ till $10000$.
+The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed. The initial value is $\mat{g}_2^{(0)} = \mat{0}$, and the symbol $\odot$ denotes the Hadamard product, that is, element-wise multiplication. The division and square root operations are performed element-wise as well. According to our experiments, RMSprop requires between $50$ and $1000$ iterations until convergence, while gradient ascent with a learning rate of $10^{-3}$ requires between $1000$ and $10000$ iterations.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Small Data Sets}\label{sec:ising-small-data-sets}

-In case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one components is always ether zero or one can occur. Its also possible to observe two exclusive components. This situation of a ``degenerate'' data set needs to be safeguarded against in practice. Working with parameters on a log-scale, this gives estimates of $\pm\infty$. This is outside of the parameter space and breaks our optimization algorithm.
+In the case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one component is always either zero or one can occur. It is also possible to observe two exclusive components. This situation of a ``degenerate'' data set needs to be safeguarded against in practice. Working with parameters on a log-scale, this gives estimates of $\pm\infty$. This is outside of the parameter space and breaks our optimization algorithm.

-The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid divition by zero as well as evaluating the log of zero, we addapt \eqref{eq:ising-mode-moments}, the mode wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that are entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively.
+The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$.
To avoid division by zero as well as evaluating the log of zero, we adapt \eqref{eq:ising-mode-moments}, the mode-wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that are entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively. -The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, effected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight. +The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, affected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Slightly Bigger Dimensions}\label{sec:ising-bigger-dim} -A big challenge for the Ising model is its high computational complexity as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Computing the partition function exactly requires to sum all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive while it is %absolutely -impossible for dimension bigger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, in other words the second moment \eqref{eq:ising-m2}, where, if dropping the scaling factor $p_0$, still involves to sum $2^p$ summands. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems. 
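To make the computational burden concrete, the following is a minimal Python sketch of the exact (brute-force) computation of the Ising second moment. It is an illustration only; the arrangement of the natural parameters in the lower triangle of a matrix is an assumption made for compactness, not necessarily the exact parameterization used in the text.
\begin{verbatim}
import itertools
import numpy as np

def ising_exact_second_moment(Gamma):
    # Gamma: p x p matrix whose lower triangle (including the diagonal) stores
    # the natural parameters, so the exponent equals vech(x x^T)' gamma.
    # Enumerates all 2^p binary vectors, hence only feasible for small p.
    p = Gamma.shape[0]
    G = np.tril(Gamma)
    M2 = np.zeros((p, p))
    Z = 0.0  # partition function, the reciprocal of p_0(gamma)
    for bits in itertools.product((0.0, 1.0), repeat=p):
        x = np.array(bits)
        w = np.exp(np.sum(G * np.outer(x, x)))  # unnormalized probability mass
        M2 += w * np.outer(x, x)
        Z += w
    return M2 / Z
\end{verbatim}
The exponential growth of this enumeration in $p$ is what motivates the Monte-Carlo approach discussed next.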
+A big challenge for the Ising model is its high computational complexity as it involves summing over all binary vectors of length $p = \prod_{k = 1}^{r}p_k$ in the partition function \eqref{eq:ising-partition-function}. Computing the partition function exactly requires summing over all $2^p$ binary vectors. For small dimensions, say $p\approx 10$, this is easily computed. Increasing the dimension beyond $20$ becomes extremely expensive while it is %absolutely
+impossible for dimensions bigger than $30$. Trying to avoid the evaluation of the log-likelihood and only computing its partial gradients via \cref{thm:grad} does not resolve the issue. The gradients require the inverse link, that is, the second moment \eqref{eq:ising-m2}, which, even after dropping the scaling factor $p_0$, still involves summing $2^p$ terms. Basically, with our model, this means that the optimization of the Ising model using exactly computed gradients is impossible for moderately sized problems.

-For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs-Sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently while the estimation accuracy for the second moment is evaluated experimentally which seems to be very reliable. Simultaneously, we use the same approach to estimate the partition function. This though, is in comparison inaccurate, and may only be used to get a rough idea of the log-likelihood. Regardless, for our method, we only need the gradient for optimization where appropriate break conditions, not based on the likelihood, lead to a working method for MLE estimation.
+For the estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method to estimate the second moment \eqref{eq:ising-m2}, required to compute the partial gradients of the log-likelihood. Specifically, we use a Gibbs sampler to sample from the conditional distribution and approximate the second moment in an importance sampling framework. This can be implemented quite efficiently, and the estimation accuracy of the second moment, evaluated experimentally, appears to be very reliable. Simultaneously, we use the same approach to estimate the partition function. The latter, though, is comparatively inaccurate and may only be used to get a rough idea of the log-likelihood. Regardless, our method only needs the gradient for optimization, and appropriate break conditions, not based on the likelihood, lead to a working procedure for maximum likelihood estimation.

\begin{figure}
	\centering
	\includegraphics[]{plots/sim-ising-perft-m2.pdf}
-	\caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using ether the exact method or a Monte-Carlo (MC) simulation.}
+	\caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using either the exact method or a Monte-Carlo (MC) simulation.}
\end{figure}

@@ -735,16 +690,16 @@ For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method t

\cref{thm:sdr} identifies the sufficient reduction for the regression of $Y$ on $\ten{X}$ in the population. Any estimation of the sufficient reduction requires application of some optimality criterion.
As we operate within the framework of the exponential family, we opted for maximum likelihood estimation (MLE). For the unconstrained problem, where the parameters are simply $\mat{B}$ and $\mat{\Omega}$ in \eqref{eq:eta1-manifold}, maximizing the likelihood of $\ten{X} \mid Y$ is straightforward and yields well-defined MLEs of both parameters. Our setting, though, requires the constrained optimization of the $\ten{X} \mid Y$ likelihood subject to $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega}=\bigkron_{j = r}^{1}\mat{\Omega}_j$. \Cref{thm:kron-manifolds,thm:param-manifold} provide the setting for which the MLE of the constrained parameter $\mat{\theta}$ is well-defined, which in turn leads to the derivation of its asymptotic normality. -The main problem in obtaining asymptotic results for the MLE of the constrained parameter $\mat{\theta} = (\overline{\ten{\eta}}, \vec\mat{B}, \vech\mat{\Omega})$ stems from the nature of the constraint. We assumed that $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, where the parameter $\mat{B}$ is identifiable. This means that different values of $\mat{B}$ lead to different densities $f_{\mat{\theta}}(\ten{X}\mid Y = y)$, a basic property needed to ensure consistency of parameter estimates, which in turn is needed for asymptotic normality. On the other hand, the components $\mat{\beta}_j$, $j = 1, \ldots, r$, are \emph{not} identifiable, which is a direct consequence of the equality $\mat{\beta}_2\otimes\mat{\beta}_1 = (c\mat{\beta}_2)\otimes (c^{-1}\mat{\beta}_1)$ for every $c\neq 0$. This is the reason we formulated $\Theta$ as a constrained parameter space instead of parameterizing the densities of $\ten{X}\mid Y$ with respect to the components $\mat{\beta}_1, \ldots, \mat{\beta}_r$. The same is true for $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$. +The main problem in obtaining asymptotic results for the MLE of the constrained parameter $\mat{\theta} = (\overline{\ten{\eta}}, \vec\mat{B}, \vech\mat{\Omega})$ stems from the nature of the constraint. We assumed that $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, where the parameter $\mat{B}$ is identifiable. This means that different values of $\mat{B}$ lead to different densities $f_{\mat{\theta}}(\ten{X}\mid Y = y)$, a basic property needed to ensure consistency of parameter estimates, which in turn is needed for asymptotic normality. On the other hand, the components $\mat{\beta}_j$, $j = 1, \ldots, r$, are \emph{not} identifiable, which is a direct consequence of the equality $\mat{\beta}_2\otimes\mat{\beta}_1 = (c\mat{\beta}_2)\otimes (c^{-1}\mat{\beta}_1)$ for every $c\neq 0$. This is the reason we considered $\Theta$ as a constrained parameter space instead of parameterizing the densities of $\ten{X}\mid Y$ with $\mat{\beta}_1, \ldots, \mat{\beta}_r$. The same is true for $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$. -In addition to identifiable parameters, asymptotic normality obtained in \cref{thm:asymptotic-normality-gmlm} requires differentiation. Therefore, the space itself needs to admit defining differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption we consider \emph{smooth manifolds}. The latter are spaces which look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. To make matters worse, a smooth manifold only allows first derivatives. 
Without going into details, the solution is a \emph{Riemannian manifold}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition as well as complicated to handle in an already complicated setting. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold which is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complication of the abstract manifold theory simplifies drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be an \emph{smooth embedded submanifold} in an open subset $\Xi$ of a Euclidean space.
+In addition to identifiable parameters, the asymptotic normality obtained in \cref{thm:asymptotic-normality-gmlm} requires differentiation. Therefore, the space itself needs to admit defining differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption we consider \emph{smooth manifolds}. The latter are spaces that look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. To make matters worse, a smooth manifold only allows first derivatives. Without going into details, the solution is a \emph{Riemannian manifold}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition as well as complicated to handle in an already complicated setting. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold which is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complications of the abstract manifold theory simplify drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, a smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be a \emph{smooth embedded submanifold} in an open subset $\Xi$ of a Euclidean space.

-Now, we directly define a \emph{smooth manifold} embedded in $\mathbb{R}^p$ without any detours to the more generel theory. See for example \textcite{Lee2012,,Lee2018,AbsilEtAl2007,Kaltenbaeck2021} among others.
+Now, we define a \emph{smooth manifold} embedded in $\mathbb{R}^p$ without any detours to the more general theory. See for example \textcite{Lee2012,Lee2018,AbsilEtAl2007,Kaltenbaeck2021} among others.

\begin{definition}[Manifolds]\label{def:manifold}
	A set $\manifold{A}\subseteq\mathbb{R}^p$ is an \emph{embedded smooth manifold} of dimension $d$ if for every $\mat{x}\in\manifold{A}$ there exists a smooth\footnote{Here \emph{smooth} means infinitely differentiable or $C^{\infty}$.} bi-continuous map $\varphi:U\cap\manifold{A}\to V$, called a \emph{chart}, with $\mat{x}\in U\subseteq\mathbb{R}^p$ open and $V\subseteq\mathbb{R}^d$ open.
\end{definition}

-We also need the concept of a \emph{tangent space} to formulate asymptotic normality in a way which is independent of a particular coordinate representation. Intuitively, the tangent space at a point $\mat{x}\in\manifold{A}$ of the manifold $\manifold{A}$ is the hyperspace of all velocity vectors $\t{\nabla\gamma(0)}$ of any curve $\gamma:(-1, 1)\to\manifold{A}$ passing through $\mat{x} = \gamma(0)$, see \cref{fig:torus}. Locally, at $\mat{x} = \gamma(0)$ with a chart $\varphi$ we can written $\gamma(t) = \varphi^{-1}(\varphi(\gamma(t)))$ which gives that $\Span\t{\nabla\gamma(0)} \subseteq \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}$. Taking the union over all smooth curves through $\mat{x}$ gives equality. The following definition leverages the simplified setup of smooth manifolds in Euclidean space.
+We also need the concept of a \emph{tangent space} to formulate asymptotic normality in a way that is independent of a particular coordinate representation. Intuitively, the tangent space at a point $\mat{x}\in\manifold{A}$ of the manifold $\manifold{A}$ is the hyperspace of all velocity vectors $\t{\nabla\gamma(0)}$ of any curve $\gamma:(-1, 1)\to\manifold{A}$ passing through $\mat{x} = \gamma(0)$, see \cref{fig:torus}. Locally, at $\mat{x} = \gamma(0)$ with a chart $\varphi$, we can write $\gamma(t) = \varphi^{-1}(\varphi(\gamma(t)))$, which gives that $\Span\t{\nabla\gamma(0)} \subseteq \Span\t{\nabla\varphi^{-1}(\varphi(\mat{x}))}$. Taking the union over all smooth curves through $\mat{x}$ gives equality. The following definition leverages the simplified setup of smooth manifolds in Euclidean space.

\begin{definition}[Tangent Space]\label{def:tangent-space}
	Let $\manifold{A}\subseteq\mathbb{R}^p$ be an embedded smooth manifold and $\mat{x}\in\manifold{A}$. The \emph{tangent space} at $\mat{x}$ of $\manifold{A}$ is defined as
@@ -767,7 +722,7 @@ We also need the concept of a \emph{tangent space} to formulate asymptotic norma

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Kronecker Product Manifolds}\label{sec:kron-manifolds}

-As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, which is a requirement of \cref{thm:param-manifold}, we need \cref{thm:kron-manifolds}. Therefore, we need the notion of a \emph{spherical} set, which is a set $\manifold{A}$, on which the Frobenius norm is constant. That is, $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$ is constant. Forthermore, we call a scale invariant set $\manifold{A}$ a \emph{cone}, that is $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.
+As a basis to ensure that the constrained parameter space $\Theta$ is a manifold, which is a requirement of \cref{thm:param-manifold}, we need \cref{thm:kron-manifolds}. Therefore, we need the notion of a \emph{spherical} set, which is a set $\manifold{A}$ on which the Frobenius norm is constant. That is, $\|\,.\,\|_F:\manifold{A}\to\mathbb{R}$ is constant. Furthermore, we call a scale-invariant set $\manifold{A}$ a \emph{cone}, that is, $\manifold{A} = \{ c \mat{A} : \mat{A}\in\manifold{A} \}$ for all $c > 0$.

\begin{theorem}[Kronecker Product Manifolds]\label{thm:kron-manifolds}
	Let $\manifold{A}\subseteq\mathbb{R}^{p_1\times q_1}\backslash\{\mat{0}\}, \manifold{B}\subseteq\mathbb{R}^{p_2\times q_2}\backslash\{\mat{0}\}$ be smooth embedded submanifolds. Assume one of the following conditions holds.
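As a brief illustration of the two notions just introduced (an added example, included only for intuition), the set of orthogonal matrices $\{\mat{Q}\in\mathbb{R}^{p\times p} : \t{\mat{Q}}\mat{Q} = \mat{I}_p\}$ is spherical, since
\begin{displaymath}
	\|\mat{Q}\|_F = \sqrt{\tr{\t{\mat{Q}}\mat{Q}}} = \sqrt{\tr{\mat{I}_p}} = \sqrt{p}
\end{displaymath}
for every such $\mat{Q}$, while the set of symmetric positive definite $p\times p$ matrices is a cone, because $c\mat{A}$ is again symmetric positive definite for every $c > 0$.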
@@ -797,9 +752,9 @@ As a basis to ensure that the constrained parameter space $\Theta$ is a manifold %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Matrix Manifolds}\label{sec:matrix-manifolds} -A powerful side effect of \cref{thm:param-manifold} is the modeling flexibinity it provides. For example, we can perform low rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others. +A powerful feature of \cref{thm:param-manifold} is the modeling flexibility it provides. For example, we can perform low-rank regression. Or, we may constrain two-way interactions between direct axis neighbors by using band matrices for the $\mat{\Omega}_k$'s, among others. -This flexibility derives from many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:param-manifold}. A list of possible choices, among others, is given in \cref{tab:matrix-manifolds}. As long as parameters in $\Theta$ are valid paramererization of a density (or PMF) of \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold}, one may choose any of the manifolds listed in \cref{tab:matrix-manifolds} which are either cones or spherical. We also included an example which is neither a sphere nor a cone. They may also be valid building blocks, but require more work as they are not directly leading to a parameter manifold by \cref{thm:param-manifold}. In case one can show the resulting parameter space $\Theta$ is an embedded manifold, the asymptotic theory of \cref{sec:asymtotics} is applicable. +This flexibility derives from many different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space $\Theta$ in \cref{thm:param-manifold}. A list of possible choices, among others, is given in \cref{tab:matrix-manifolds}. As long as parameters in $\Theta$ are a valid parameterization of a density (or PMF) of \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold}, one may choose any of the manifolds listed in \cref{tab:matrix-manifolds} which are either cones or spherical. We also included an example which is neither a sphere nor a cone. They may also be valid building blocks but require more work as they are not directly leading to a parameter manifold by \cref{thm:param-manifold}. In case one can show the resulting parameter space $\Theta$ is an embedded manifold, the asymptotic theory of \cref{sec:asymtotics} is applicable. \begin{table} \centering @@ -843,7 +798,7 @@ This flexibility derives from many different matrix manifolds that can be used a \section{Statistical Properties} \subsection{Asymptotics}\label{sec:asymtotics} -Let $Z$ be a random variable distributed according to a parameterized probability distribution with density $f_{\mat{\theta_0}}\in\{ f_{\mat{\theta}} : \mat{\theta}\in\Theta \}$ where $\Theta$ is a subset of a Euclidean space. We want to estimate the parameter ${\mat{\theta}}_0$ using $n$ i.i.d. (independent and identically distributed) copies of $Z$. We assume a known, real-valued and measurable function $z\mapsto m_{\mat{\theta}}(z)$ for every $\mat{\theta}\in\Theta$ and that ${\mat{\theta}}_0$ is the unique maximizer of the map $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$. 
For the estimation we maximize the empirical version +Let $Z$ be a random variable distributed according to a parameterized probability distribution with density $f_{\mat{\theta_0}}\in\{ f_{\mat{\theta}}: \mat{\theta}\in\Theta \}$ where $\Theta$ is a subset of a Euclidean space. We want to estimate the parameter ${\mat{\theta}}_0$ using $n$ i.i.d. (independent and identically distributed) copies of $Z$. We assume a known, real-valued and measurable function $z\mapsto m_{\mat{\theta}}(z)$ for every $\mat{\theta}\in\Theta$ and that ${\mat{\theta}}_0$ is the unique maximizer of the map $\mat{\theta}\mapsto M(\mat{\theta}) = \E m_{\mat{\theta}}(Z)$. For the estimation we maximize the empirical version \begin{align}\label{eq:Mn} M_n(\mat{\theta}) &= \frac{1}{n}\sum_{i = 1}^n m_{\mat{\theta}}(Z_i). \end{align} @@ -915,7 +870,7 @@ for every non-empty compact $K\subset\Xi$. Then, there exists a strong M-est %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Simulations}\label{sec:simulations} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In this section we report simulation results for the tensor normal and the Ising model where different aspects of the GMLM model are compared against other methods. The comparison methods are Tensor Sliced Inverse Regression (TSIR) \parencite{DingCook2015}, Multiway Generalized Canonical Correlation Analysis (MGCCA) \parencite{ChenEtAl2021,GirkaEtAl2024} and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \textcite{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As part of our baseline analysis, we also incorporate traditional Principal Component Analysis (PCA) on vectorized observations. In the case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \textcite{LandgrafLee2020}. All experiments are performed at sample sizes $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times. +In this section, we report simulation results for the multi-linear normal and the multi-linear Ising model where different aspects of the GMLM model are compared against other methods. The comparison methods are Tensor Sliced Inverse Regression (TSIR) \parencite{DingCook2015}, Multiway Generalized Canonical Correlation Analysis (MGCCA) \parencite{ChenEtAl2021,GirkaEtAl2024} and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \textcite{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As part of our baseline analysis, we also incorporate traditional Principal Component Analysis (PCA) on vectorized observations. In the case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \textcite{LandgrafLee2020}. All experiments are performed at sample sizes $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times. We are interested in the quality of the estimate of the true sufficient reduction $\ten{R}(\ten{X})$ from \cref{thm:sdr}. Therefore, we compare with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. 
The distance $d(\mat{B}, \hat{\mat{B}})$ between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$ is the \emph{subspace distance} which is proportional to \begin{displaymath} @@ -924,16 +879,13 @@ We are interested in the quality of the estimate of the true sufficient reductio the Frobenius norm of the difference between the projections onto the span of $\mat{B}$ and $\hat{\mat{B}}$. The proportionality constant\footnote{Depends on row dimension $p$ and the ranks of $\mat{B}$ and $\hat{\mat{B}}$ given by $(\min(\rank\mat{B} + \rank\hat{\mat{B}}, 2 p - (\rank\mat{B} + \rank\hat{\mat{B}})))^{-1/2}$.} of $d(\mat{B}, \hat{\mat{B}})$ ensures that the subspace distance is in the interval $[0, 1]$. A distance of zero implies space overlap, a distance of one means that the subspaces are orthogonal. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Tensor Normal}\label{sec:sim-tensor-normal} +\subsection{Multi-Linear Normal}\label{sec:sim-tensor-normal} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -We generate a random sample $y_i$, $i=1,\ldots, n$, from the standard normal distribution. We then draw i.i.d. samples $\ten{X}_i$ for $i = 1, ..., n$ from the conditional tensor normal distribution of $\ten{X}\mid Y = y_i$. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional tensor normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}. +We generate a random sample $y_i$, $i=1,\ldots, n$, from the standard normal distribution. We then draw i.i.d. samples $\ten{X}_i$ for $i = 1, ..., n$ from the conditional multi-linear normal distribution of $\ten{X}\mid Y = y_i$. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional multi-linear normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}. We consider the following settings: -%We start with a $1$ dimensional linear dependence on $y$ in 1a). Then, the dependence of $y$ is via a cubic polynomial 1b-d). In 1b) reduction is full rank, in contrast to 1c) where the $\mat{\beta}_k$'s are of rank $1$, in other words, low rank regression. In 1d) we constrain the inverse covariances $\mat{\Omega}_k$ to be tri-diagonal. Both, 1c-d) are examples of building the parameter space according to \cref{thm:param-manifold}. The final tensor normal experiment 1e) is a model misspecification. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well. 
- \begin{itemize} - \item[1a)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, %The dependence through the inverse regression model is linear specifically means that - and $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regression like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$. + \item[1a)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, and $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regression like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$. \item[1b)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, and relates to the response $y$ via a qubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$ by the twice iterated outer product of the vector $(1, y)$. Element wise this reads $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$. All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$th unit vector and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$. \item[1c)] Same as 1b), except that the GMLM parameters $\mat{\beta}_k$ are rank $1$ given by \begin{displaymath} @@ -942,33 +894,32 @@ We consider the following settings: \mat{\beta}_3 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix}. \end{displaymath} \item[1d)] Same as 1b), but the true $\mat{\Omega}_k$ is tri-diagonal, for $k = 1, 2, 3$. Their elements are given by $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$ with $\delta_{i, j}$ being the Kronecker delta. - \item[1e)] For the misspecification model we let $\ten{X}\mid Y$ be multivariate but \emph{not} tensor normal. Let $\ten{X}$ be a $5\times 5$ random matrix with normal entries, $Y$ univariate standard normal and $\mat{f}_y$ a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$ consisting of the first $4$ columns of the identity; i.e., $\mat{B}_{i j} = \delta_{i j}$. The variance-covariance matrix $\mat{\Sigma}$ has elements $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. %is an auto-regression like structure with correlation coefficient $0.5$. - Both, $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. Then, we set + \item[1e)] For the misspecification model we let $\ten{X}\mid Y$ be multivariate but \emph{not} multi-linear normal. Let $\ten{X}$ be a $5\times 5$ random matrix with normal entries, $Y$ univariate standard normal and $\mat{f}_y$ a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$ consisting of the first $4$ columns of the identity; i.e., $\mat{B}_{i j} = \delta_{i j}$. The variance-covariance matrix $\mat{\Sigma}$ has elements $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. Both, $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. 
Then, we set \begin{displaymath} \vec{\ten{X}}\mid (Y = y) = \mat{B}\mat{f}_y + \mathcal{N}_{25}(\mat{0}, \mat{\Sigma}). \end{displaymath} Furthermore, we fit the model with the wrong ``known'' function $\ten{F}_y$. We set $\ten{F}_y$ to be a $2\times 2$ matrix with $(\ten{F}_y)_{i j} = y^{i + j - 2}$, $i,j=1,2$. \end{itemize} -The final tensor normal experiment 1e) is a misspecified model to explore the robustness of our approach. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well. +The final multi-linear normal experiment 1e) is a misspecified model to explore the robustness of our approach. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well. \begin{figure}[hp!] \centering \includegraphics[width = \textwidth]{plots/sim-normal.pdf} - \caption{\label{fig:sim-normal}Visualization of the simulation results for the tensor normal GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-tensor-normal}.} + \caption{\label{fig:sim-normal}Visualization of the simulation results for the multi-linear normal GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-tensor-normal}.} \end{figure} -The results are visualized in \cref{fig:sim-normal}. Simulation 1a), given a 1D linear relation between the response $Y$ and $\ten{X}$, TSIR and GMLM are equivalent. This is expected as \textcite{DingCook2015} already established that TSIR gives the MLE estimate under a tensor (matrix) normal distributed setting. For the other methods, MGCCA is only a bit better than PCA which, unexpectedly, beats HOPCA. But none of them are close to the performance of TSIR or GMLM. Continuing with 1b), where we introduced a cubic relation between $Y$ and $\ten{X}$, we observe a bigger deviation in the performance of GMLM and TSIR. This is caused mainly because we are estimating an $8$ dimensional subspace now, which amplifies the small performance boost, in the subspace distance, we gain by avoiding slicing. The GMLM model in 1c) behaves as expected, clearly being the best. The other results are surprising. First, PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal-to-noise ratio in this particular example. But the biggest surprise is the failure of TSIR. Even more surprising is that the conditional distribution $\ten{X}\mid Y$ is tensor normal distributed which, in conjunction with $\cov(\vec\ten{X})$ having a Kronecker structure, should give the MLE estimate. The low-rank assumption is also not an issue, this simply relates to TSIR estimating a 1D linear reduction which fulfills all the requirements. Finally, a common known issue of slicing, used in TSIR, is that conditional multi-modal distributions can cause estimation problems due to the different distribution modes leading to vanishing slice means. Again, this is not the case in simulation 1c). -An investigation into this behaviour revealed the problem in the estimation of the mode covariance matrices $\mat{O}_k = \E[(\ten{X} - \E\ten{X})_{(k)}\t{(\ten{X} - \E\ten{X})_{(k)}}]$. The mode wise reductions provided by TSIR are computed as $\hat{\mat{O}}_k^{-1}\hat{\mat{\Gamma}}_k$ where the poor estimation of $\hat{\mat{O}}_k$ causes the failure of TSIR. 
The poor estimate of $\mat{O}_k$ is rooted in the high signal to noise ratio in this particular simulation. GMLM does not have degenerate behaviour for high signal to noise ratios but it is less robust in low signal to noise ratio setting where TSIR performs better in this specific example. -Simulation 1d), incorporating information about the covariance structure behaves similar to 1b), except that GMLM gains a statistically significant lead in estimation performance. The last simulation, 1e), where the model was misspecified for GMLM. GMLM, TSIR as well as MGCCA are on par where GMLM has a sligh lead in the small sample size setting and MGCCA overtakes in higher sample scenarios. The PCA and HOPCA methods both still outperformed. A wrong assumption about the relation to the response is still better than no relation at all. +The results are visualized in \cref{fig:sim-normal}. Simulation 1a), given a 1D linear relation between the response $Y$ and $\ten{X}$, TSIR and GMLM are equivalent. This is expected as \textcite{DingCook2015} already established that TSIR gives the MLE estimate under a multi-linear (matrix) normal distributed setting. For the other methods, MGCCA is only a bit better than PCA which, unexpectedly, beats HOPCA. But none of them are close to the performance of TSIR or GMLM. Continuing with 1b), where we introduced a cubic relation between $Y$ and $\ten{X}$, we observe a bigger deviation in the performance of GMLM and TSIR. This is caused mainly because we are estimating an $8$ dimensional subspace now, which amplifies the small performance boost, in the subspace distance, we gain by avoiding slicing. The GMLM model in 1c) behaves as expected, clearly being the best. The other results are surprising. First, PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal-to-noise ratio in this particular example. But the biggest surprise is the failure of TSIR. Even more surprising is that the conditional distribution $\ten{X}\mid Y$ is multi-linear normal distributed which, in conjunction with $\cov(\vec\ten{X})$ having a Kronecker structure, should give the MLE estimate. The low-rank assumption is also not an issue, this simply relates to TSIR estimating a 1D linear reduction which fulfills all the requirements. Finally, a common known issue of slicing, used in TSIR, is that conditional multi-modal distributions can cause estimation problems due to the different distribution modes leading to vanishing slice means. Again, this is not the case in simulation 1c). +An investigation into this behavior revealed the problem in the estimation of the mode covariance matrices $\mat{O}_k = \E[(\ten{X} - \E\ten{X})_{(k)}\t{(\ten{X} - \E\ten{X})_{(k)}}]$. The mode wise reductions provided by TSIR are computed as $\hat{\mat{O}}_k^{-1}\hat{\mat{\Gamma}}_k$ where the poor estimation of $\hat{\mat{O}}_k$ causes the failure of TSIR. The poor estimate of $\mat{O}_k$ is rooted in the high signal-to-noise ratio in this particular simulation. GMLM does not have degenerate behavior for high signal-to-noise ratios but it is less robust in low signal-to-noise ratio setting where TSIR performs better in this specific example. +Simulation 1d), incorporating information about the covariance structure behaves similarly to 1b), except that GMLM gains a statistically significant lead in estimation performance. The last simulation, 1e), where the model was misspecified for GMLM. 
GMLM, TSIR, and MGCCA are on par, with GMLM having a slight lead in the small sample size setting and MGCCA overtaking in larger sample scenarios. The PCA and HOPCA methods are both still outperformed. A wrong assumption about the relation to the response is still better than no relation at all.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Ising Model}\label{sec:sim-ising}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-Assuming for $\ten{X}$ being a $2\times 3$ dimensional binary matrix with conditional matrix (tensor) Ising distribution $\ten{X}\mid Y$ as in \cref{sec:ising_estimation}. We let for $i = 1, \ldots, n$ the response being i.i.d. uniformly distributed in $[-1, 1]$ establishing the conditional value in the i.i.d. samples from $\ten{X}_i\mid Y = y_i$ with GMLM parameters $\mat{\beta}_1, \mat{\beta}_2$, $\mat{\Omega}_1, \mat{\Omega}_2$. We let
+We assume $\ten{X}$ to be a $2\times 3$ dimensional binary matrix with conditional matrix (multi-linear) Ising distribution $\ten{X}\mid Y$ as in \cref{sec:ising_estimation}. For $i = 1, \ldots, n$, the responses $y_i$ are i.i.d. uniformly distributed on $[-1, 1]$, and, conditionally on $Y = y_i$, we draw the i.i.d. samples $\ten{X}_i$ with GMLM parameters $\mat{\beta}_1, \mat{\beta}_2$, $\mat{\Omega}_1, \mat{\Omega}_2$. We let
\begin{displaymath}
	\mat{\beta}_1 = \begin{pmatrix} 1 & 0 \\ 0 & 1
@@ -1006,7 +957,7 @@ if not mentioned otherwise in a specific simulation setup given next.
	0 & 0 \\ 1 & -1 \\ 0 & 0 \end{pmatrix}.
	\end{displaymath}
-	\item[2d)] We conclude with a simulation relating to the original design of the Ising model. It is a mathematical model to study the behaviour of Ferromagnetism \textcite{Ising1925} in a thermodynamic setting modeling the interaction effects of elementary magnets (spin up/down relating to $0$ and $1$). The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Assuming the magnets to be arranged in a 2D grid (matrix valued $\ten{X}$), their interactions are constraint to direct neighbours. We can model this by choosing the true $\mat{\Omega}_k$'s to be tri-diagonal matrices with zero diagonal entries and all non-zero entries identical. Since this is a 1D matrix manifold, we can enforce the constraint. Setting the true interaction parameters to be
+	\item[2d)] We conclude with a simulation relating to the original design of the Ising model. It is a mathematical model to study the behavior of ferromagnetism \parencite{Ising1925} in a thermodynamic setting modeling the interaction effects of elementary magnets (spin up/down relating to $0$ and $1$). The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Assuming the magnets to be arranged in a 2D grid (matrix-valued $\ten{X}$), their interactions are constrained to direct neighbors. We can model this by choosing the true $\mat{\Omega}_k$'s to be tri-diagonal matrices with zero diagonal entries and all non-zero entries identical. Since this is a 1D matrix manifold, we can enforce the constraint.
Setting the true interaction parameters to be \begin{displaymath} \mat{\Omega}_1 = \frac{1}{2}\begin{pmatrix} 0 & 1 \\ 1 & 0 @@ -1025,10 +976,10 @@ if not mentioned otherwise in a specific simulation setup given next. \caption{\label{fig:sim-ising}Visualization of the simulation results for Ising GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-ising}.} \end{figure} -The simulation results are visualized in \cref{fig:sim-ising}. Regardless of the simulation setting 2a-d), the comparative results are similar. We observe that PCA and HOPCA, both treating the response $\ten{X}$ as continuous, perform poorly. Not much better are LPCA and CLPCA. Similar to PCA and HOPCA they do not consider the relation to the response, but they are specifically created for binary predictors. Next we have MGCCA which is the first method considering the relation to the response $y$, clearly out-performing all the PCA variants. Even better is TSIR, regardless of the treatment of the predictors $\ten{X}$ as continuous, achieving very good results. Finally, the Ising GMLM model is the best in all the simulations although TSIR gets very close in some settings. +The simulation results are visualized in \cref{fig:sim-ising}. Across the simulation settings 2a)--2d), the comparative results are similar. We observe that PCA and HOPCA, both treating the binary predictors $\ten{X}$ as continuous, perform poorly. Not much better are LPCA and CLPCA. Like PCA and HOPCA, they do not consider the relation to the response, although they are specifically designed for binary predictors. Next, we have MGCCA, the first method that considers the relation to the response $y$, which clearly outperforms all the PCA variants. Even better is TSIR which, despite treating the predictors $\ten{X}$ as continuous, achieves very good results. Finally, the Ising GMLM model is the best in all the simulations, although TSIR gets very close in some settings. -% Due to the surprisingly good result of TSIR, we also applied the tensor normal GMLM model to the exact same simulation, simply treating the response $\ten{X}$ as continuous. -% The raw linear reduction estimates of both the Ising GMLM and the tensor normal GMLM are basically indistinguishable, similar to the very similar results of the different PCA variants. The main reason for this specific +% Due to the surprisingly good result of TSIR, we also applied the multi-linear normal GMLM model to the exact same simulation, simply treating the response $\ten{X}$ as continuous. +% The raw linear reduction estimates of both the Ising GMLM and the multi-linear normal GMLM are basically indistinguishable, similar to the very similar results of the different PCA variants. The main reason for this specific % \begin{table} % \begin{tabular}{c | ccc ccc c} @@ -1048,20 +999,20 @@ The simulation results are visualized in \cref{fig:sim-ising}. Regardless of the %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Data Analysis}\label{sec:data-analysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In this section we perform two applications of the GMLM model on real data. First example is the tensor normal model applied to EEG data. Next, we perform a prove of concept data analysis example for chess. +In this section, we perform two applications of the GMLM model on real data. 
The first example is the multi-linear normal model applied to EEG data. Next, we perform a proof-of-concept data analysis for chess. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{EEG} -The EEG data (\url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}) is a small study of $77$ alcoholic and $45$ control subjects. Each data point corresponding to a subject consists of a $p_1\times p_2 = 256\times 64$ matrix, with each row representing a time point and each column a channel. The measurements were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps sampled at $256$ time points over $1$ second ($256$ Hz). Different stimulus conditions were used, and for each condition, $120$ trials were measured. We used only a single stimulus condition (S1), and for each subject, we took the average of all the trials under that condition. That is, we used $(\ten{X}_i, y_i)$, $i = 1, \ldots, 122$, where $\ten{X}_i$ is a $256\times 64$ matrix, with each entry representing the mean voltage value of subject $i$ at a combination of a time point and a channel, averaged over all trials under the S1 stimulus condition, and $Y$ is a binary outcome variable with $Y_i = 1$ for an alcoholic and $Y_i = 0$ for a control subject. +The EEG data\footnote{\fullcite{eeg-dataset}} is a small study of $77$ alcoholic and $45$ control subjects. Each data point corresponding to a subject consists of a $p_1\times p_2 = 256\times 64$ matrix, with each row representing a time point and each column a channel. The measurements were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps sampled at $256$ time points over $1$ second ($256$ Hz). Different stimulus conditions were used, and for each condition, $120$ trials were measured. We used only a single stimulus condition (S1), and for each subject, we took the average of all the trials under that condition. That is, we used $(\ten{X}_i, y_i)$, $i = 1, \ldots, 122$, where $\ten{X}_i$ is a $256\times 64$ matrix, with each entry representing the mean voltage value of subject $i$ at a combination of a time point and a channel, averaged over all trials under the S1 stimulus condition, and $Y$ is a binary outcome variable with $Y_i = 1$ for an alcoholic and $Y_i = 0$ for a control subject. -For a comparison we reproduced the leave-one-out cross-validation EEG data analysis \textcite[Sec. 7]{PfeifferKaplaBura2021} for the classification task. In this data set, $p= p_1 p_2 = 16384$ is much larger than $n=122$. To deal with this issue, \textcite{PfeifferKaplaBura2021} used two approaches. In the first, pre-screening via (2D)$^2$PCA \parencite{ZhangZhou2005} reduced the dimensions to $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$. In the second, simultaneous dimension reductions and variable selection was carried out using the fast POI-C algorithm of \textcite{JungEtAl2019} (due to high computational high burden, only a 10-fold cross-validation was performed for fast POI-C). +For comparison, we reproduced the leave-one-out cross-validation EEG data analysis of \textcite[Sec. 7]{PfeifferKaplaBura2021} for the classification task. In this data set, $p= p_1 p_2 = 16384$ is much larger than $n=122$. To deal with this issue, \textcite{PfeifferKaplaBura2021} used two approaches. 
In the first, pre-screening via (2D)$^2$PCA \parencite{ZhangZhou2005} reduced the dimensions to $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$. In the second, simultaneous dimension reduction and variable selection were carried out using the fast POI-C algorithm of \textcite{JungEtAl2019} (due to high computational burden, only 10-fold cross-validation was performed for fast POI-C). -In contrast to \textcite{PfeifferKaplaBura2021}, our GMLM model can be applied directly to the raw data of dimension $(256, 64)$ without pre-screening or variable selection. This was not possible for K-PIR as the time axis alone was in the large $p$ small $n$ regime with the $p_1 = 256 > n = 122$ leading to a singular time axis covariance. The same issue is present in the GMLM model, but the regularization trick used for numerical stability, as described in \cref{sec:tensor-normal-estimation}, resolves this without any change to the estimation procedure. In general, the sample size does not need to be large for maximum likelihood estimation in the tensor normal model. In matrix normal models in particular, \cite{DrtonEtAl2020} proved that very small sample sizes, as little as $3$,\footnote{The required minimum sample size depends on a non-trivial algebraic relations between the mode dimensions, while the magnitude of the dimensions has no specific role.} are sufficient to obtain unique MLEs for Kronecker covariance structures. +In contrast to \textcite{PfeifferKaplaBura2021}, our GMLM model can be applied directly to the raw data of dimension $(256, 64)$ without pre-screening or variable selection. This was not possible for K-PIR as the time axis alone was in the large $p$, small $n$ regime, with $p_1 = 256 > n = 122$ leading to a singular time axis covariance. The same issue is present in the GMLM model, but the regularization trick used for numerical stability, as described in \cref{sec:tensor-normal-estimation}, resolves this without any change to the estimation procedure. In general, the sample size does not need to be large for maximum likelihood estimation in the multi-linear normal model. In matrix normal models in particular, \textcite{DrtonEtAl2020} proved that very small sample sizes, as small as $3$,\footnote{The required minimum sample size depends on non-trivial algebraic relations between the mode dimensions, while the magnitude of the dimensions has no specific role.} are sufficient to obtain unique MLEs for Kronecker covariance structures. We use leave-one-out cross-validation to obtain unbiased AUC estimates. Then, we compare the GMLM model to the best performing methods from \textcite{PfeifferKaplaBura2021}, namely K-PIR (ls) and LSIR from \textcite{PfeifferForzaniBura2012} for $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$. 
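+A minimal sketch of the leave-one-out AUC computation in \texttt{R} is given below. Here \texttt{gmlm\_fit()} and \texttt{predict\_score()} are hypothetical placeholders for fitting the reduction and classifier on all but one subject and for scoring the held-out subject; the AUC is then obtained from the leave-one-out scores via its Mann--Whitney form.
+\begin{verbatim}
+# X: 122 x 256 x 64 array of EEG matrices, y: binary response (1 = alcoholic)
+loo_scores <- vapply(seq_along(y), function(i) {
+  fit <- gmlm_fit(X[-i, , ], y[-i])   # hypothetical: fit on the n - 1 subjects
+  predict_score(fit, X[i, , ])        # hypothetical: score the held-out subject
+}, numeric(1))
+# AUC from the leave-one-out scores (Mann-Whitney form)
+r  <- rank(loo_scores)
+n1 <- sum(y == 1); n0 <- sum(y == 0)
+auc <- (sum(r[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)
+\end{verbatim}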
-In \cref{tab:eeg} we provide the AUC and its standard deviation. For all applied pre-screening dimensions, K-PIR (ls) has an AUC of $78\%$. LSIR performs better at the price of some instability; it peaked at $85\%$ at $(3, 4)$, then dropped down to $81\%$ at $(15, 15)$ and then increased to $83\%$. In contract, our GMLM method peaked at $(3, 4)$ with $85\%$ and stayed stable at $84\%$, even when no pre-processing was applied. In contrast, fast POI-C that carries out simultaneous feature extraction and feature selection resulted in an AUC of $63\%$, clearly outperformed by all other methods. +In \cref{tab:eeg} we provide the AUC and its standard deviation. For all applied pre-screening dimensions, K-PIR (ls) has an AUC of $78\%$. LSIR performs better at the price of some instability; it peaked at $85\%$ at $(3, 4)$, then dropped to $81\%$ at $(15, 15)$, and then increased to $83\%$. In contrast, our GMLM method peaked at $(3, 4)$ with $85\%$ and stayed stable at $84\%$, even when no pre-processing was applied. Finally, fast POI-C, which carries out simultaneous feature extraction and feature selection, resulted in an AUC of $63\%$ and is clearly outperformed by all other methods. \begin{table}[!hpt] \centering @@ -1086,9 +1037,9 @@ In \cref{tab:eeg} we provide the AUC and its standard deviation. For all applied %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Chess}\label{sec:chess} -The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We randomly selected the November of 2023 data that consist of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted as scores, are from Stockfish\footnote{\fullcite{stockfish}}, a free and strong chess engine. The scores take the role of the response $Y$ and correspond to a winning probability from the white pieces point of view. Positive scores are good for white and negative scores indicate an advantage for black pieces. We ignore all highly unbalanced positions, which we set to be positions with absolute score above $5$. We also remove all positions with a mate score (one side can force checkmate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings including the start position which is in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates. +The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We randomly selected the November 2023 data, which consist of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted as scores, are from Stockfish\footnote{\fullcite{stockfish}}, a free and strong chess engine. The scores take the role of the response $Y$ and correspond to a winning probability from the white pieces' point of view. Positive scores are good for white and negative scores indicate an advantage for the black pieces. We ignore all highly unbalanced positions, which we define to be positions with an absolute score above $5$. We also remove all positions with a mate score (one side can force checkmate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings, including the start position, which occurs in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates. -A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{piece}}$ of dimensions $8\times 8$. Every binary matrix encodes the positioning of a particular piece by containing a $1$ if the piece is present at the corresponding board position. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), queens (\queen) and kings (\king) of two colors, black and white. See \cref{fig:fen2tensor} for a visualization. +A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{piece}}$ of dimensions $8\times 8$. Every binary matrix encodes the positioning of a particular piece by containing a $1$ if the piece is present at the corresponding board position. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), rooks (\rook), queens (\queen), and kings (\king) of two colors, black and white. See \cref{fig:fen2tensor} for a visualization.
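+Constructing this encoding programmatically is straightforward; the following is a minimal sketch in \texttt{R}, assuming a hypothetical data frame \texttt{pieces} that lists, for a single position, the piece index ($1$ to $12$) together with the rank and file it occupies.
+\begin{verbatim}
+# one position as an 8 x 8 x 12 binary array (rank, file, piece)
+position <- array(0L, dim = c(8, 8, 12))
+position[cbind(pieces$rank, pieces$file, pieces$piece)] <- 1L
+\end{verbatim}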
\begin{figure}[hp!] \centering @@ -1096,26 +1047,26 @@ A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{p \caption{\label{fig:fen2tensor}The chess start position and its 3D binary tensor representation, empty entries are $0$.} \end{figure} -We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model \cref{sec:ising_estimation} with different conditional piece predictors being independent. The independence assumption is for the sake of simplicity even though this is clearly not the case in the underlying true distribution. By this simplifying assumption we get a mixture model with the log-likelihood +We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model as in \cref{sec:ising_estimation}, with the different conditional piece predictors being independent. The independence assumption is made for the sake of simplicity, even though it does not hold in the underlying true distribution. With this simplifying assumption we get a mixture model with the log-likelihood \begin{displaymath} l_n(\mat{\theta}) = \frac{1}{12}\sum_{\mathrm{piece}}l_n(\mat{\theta}_{\mathrm{piece}}) \end{displaymath} where $l_n(\mat{\theta}_{\mathrm{piece}})$ is the Ising GMLM log-likelihood as in \cref{sec:ising_estimation} for $\ten{X}_{\mathrm{piece}}\mid Y = y$. For every component the same relation to the scores $y$ is modeled via a $2\times 2$ dimensional matrix valued function $\ten{F}_y$ consisting of the monomials $1, y, y^2$, specifically $(\ten{F}_y)_{i j} = y^{i + j - 2}$.
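+Written out, since $i, j \in \{1, 2\}$, this is simply
+\begin{displaymath}
+  \ten{F}_y = \begin{pmatrix} 1 & y \\ y & y^2 \end{pmatrix},
+\end{displaymath}
+that is, each mixture component models both a linear and a quadratic relation to the score $y$.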
-By the raw scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option. Due to the high dimension on binary data, which is $12$ times a $8\times 8$ for every observation giving a total dimension of $768$. The main issue is that a manageable subset, say one million observations, still leads to a degenerate data set. +Due to the raw scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option either, because of the high dimension of the binary data: $12$ matrices of dimension $8\times 8$ per observation, giving a total dimension of $768$. The main issue is that a manageable subset, say one million observations, still leads to a degenerate data set. In our simplified mixture model, the pawns are a particular issue as there are many millions of different combinations of the $8$ pawns per color on the $6\times 8$ sub-grid where pawns can be positioned. This alone does not allow us to take a reasonably sized subset for estimation. The solution is to switch from classic gradient-based optimization to a stochastic version, meaning that every gradient update uses a new random subset of the entire data set. Therefore, we draw independent random samples from the data set of $64$ million positions. The independence of samples derives from the independence of games, as every sample is drawn from a different game. \paragraph{Validation:} -Given the non-linear nature of the reduction, due to the quadratic matrix valued function $\ten{F}_y$ of the score $y$, we use a \emph{generalized additive model}\footnote{using the function \texttt{gam()} from the \texttt{R} package \texttt{mgcv}.} (GAM) to predict position scores from reduced positions. The reduced positions are $48$ dimensional continuous values by combining the $12$ mixture components from the $2\times 2$ matrix valued reductions per piece. The per piece reduction is +Given the non-linear nature of the reduction, due to the quadratic matrix-valued function $\ten{F}_y$ of the score $y$, we use a \emph{generalized additive model}\footnote{using the function \texttt{gam()} from the \texttt{R} package \texttt{mgcv}.} (GAM) to predict position scores from reduced positions. The reduced positions are $48$-dimensional continuous vectors, obtained by combining the $12$ mixture components of the $2\times 2$ matrix-valued reductions per piece. The per-piece reduction is \begin{displaymath} \ten{R}(\ten{X}_{\mathrm{piece}}) = \mat{\beta}_{1,\mathrm{piece}}(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}})\t{\mat{\beta}_{2, \mathrm{piece}}} \end{displaymath} -which gives the complete $48$ dimensional vectorized reduction by stacking the piece wise reductions +which gives the complete $48$-dimensional vectorized reduction by stacking the piece-wise reductions \begin{displaymath} \vec{\ten{R}(\ten{X})} = (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})}) = \t{\mat{B}}\vec(\ten{X} - \E\ten{X}). \end{displaymath} -The second line encodes all the piece wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves $R^2_{\mathrm{lm}}\approx 26\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main authors personal chess engine.} engine, given the full position (\emph{not} reduced), achieves an $R^2_{\mathrm{hce}}\approx 52\%$. The $42\%$ are reasonably well compared to $51\%$ of the engine static evaluation which gets the original position and uses chess specific expect knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are; \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions).
In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure} or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed. +The second line encodes all the piece-wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$, which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves only $R^2_{\mathrm{lm}}\approx 26\%$, which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main author's chess engine.} engine, given the full position (\emph{not} reduced), achieves $R^2_{\mathrm{hce}}\approx 52\%$. The GAM's $46\%$ compares reasonably well with the $52\%$ of the engine's static evaluation, which sees the original position and uses chess-specific expert knowledge. Features of the static evaluation that are expected to be learned by the GMLM mixture model are \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess-specific features like \emph{king safety}, \emph{pawn structure}, or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed.
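+A minimal sketch of this validation step is shown below, assuming the $48$ reduced features and the scores are collected in a hypothetical data frame \texttt{reduced} with columns \texttt{R1}, \ldots, \texttt{R48} and \texttt{y}; the additive specification with one smooth term per reduced feature is one possible choice, not necessarily the one used here.
+\begin{verbatim}
+library(mgcv)
+# GAM with one smooth term per reduced feature versus a plain linear model
+form <- as.formula(paste("y ~", paste(sprintf("s(R%d)", 1:48), collapse = " + ")))
+fit_gam <- gam(form, data = reduced)
+fit_lm  <- lm(y ~ ., data = reduced)
+summary(fit_gam)$r.sq        # compare with R^2_gam
+summary(fit_lm)$r.squared    # compare with R^2_lm
+\end{verbatim}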
\paragraph{Interpretation:} For a compact interpretation of the estimated reduction we construct PSQTs. To do so we use the linear model from the validation section. Then, we rewrite the combined linear reduction and linear model in terms of PSQTs. Let $\mat{B}$ be the $768\times 48$ full vectorized linear reduction. This is the block diagonal matrix with the $64\times 4$ dimensional per piece reductions $\mat{B}_{\mathrm{piece}} = \mat{\beta}^{\mathrm{piece}}_2\otimes\mat{\beta}^{\mathrm{piece}}_1$. Then, the linear model with coefficients $\mat{b}$ and intercept $a$ on the reduced data is given by \begin{equation}\label{eq:chess-lm} @@ -1123,14 +1074,13 @@ with an unknown mean zero error term $\epsilon$ and treating the binary tensor $\ten{X}$ as continuous. Decomposing the linear model coefficients into blocks of $4$ gives per piece coefficients $\mat{b}_{\mathrm{piece}}$ which combine with the diagonal blocks $\mat{B}_{\mathrm{piece}}$ of $\mat{B}$ only. Rewriting \eqref{eq:chess-lm} gives \begin{align*} - y - &= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\ + y &= a + \sum_{\mathrm{piece}}\t{(\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})}\vec(\ten{X}_{\mathrm{piece}} - \E\ten{X}_{\mathrm{piece}}) + \epsilon \\ &= \tilde{a} + \sum_{\mathrm{piece}}\langle \mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}}, \vec(\ten{X}_{\mathrm{piece}}) \rangle + \epsilon \end{align*} -with a new intercept term $\tilde{a}$, which is of no interest to us. Finally, we enforce a color symmetry, using known mechanism from chess engines. Specifically, mirroring the position changes the sign of the score $y$. Here, mirroring reverses the rank (row) order, this is the image in a mirror behind a chess board. Let for every $\mat{C}_{\mathrm{piece}}$ be a $8\times 8$ matrix with elements $(\mat{C}_{\mathrm{piece}})_{i j} = (\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})_{i + 8 (j - 1)}$. And denote with $\mat{M}(\mat{A})$ the matrix mirror operation which reverses the row order of a matrix. Using this new notation allows to enforcing this symmetry leading to the new approximate linear relation +with a new intercept term $\tilde{a}$, which is of no interest to us. Finally, we enforce color symmetry, using a known mechanism from chess engines. Specifically, mirroring the position changes the sign of the score $y$. Here, mirroring reverses the rank (row) order; it corresponds to the image in a mirror placed behind the chess board. For every piece, let $\mat{C}_{\mathrm{piece}}$ be the $8\times 8$ matrix with elements $(\mat{C}_{\mathrm{piece}})_{i j} = (\mat{B}_{\mathrm{piece}}\mat{b}_{\mathrm{piece}})_{i + 8 (j - 1)}$, and denote by $\mat{M}(\mat{A})$ the mirror operation that reverses the row order of a matrix $\mat{A}$. Using this notation, enforcing the symmetry leads to the new approximate linear relation \begin{align*} y &= \tilde{a} + \sum_{\mathrm{piece}}\langle \mat{C}_{\mathrm{piece}}, @@ -1141,7 +1091,7 @@ with a new intercept term $\tilde{a}$, which is of no interest to us. Finally, w \ten{X}_{\text{white piece}} - \mat{M}(\ten{X}_{\text{white piece}}) \rangle + \epsilon \end{align*} -If for every piece type ($6$ types, \emph{not} distinguishing between color) holds $\mat{C}_{\text{white piece}} = -\mat{M}(\mat{C}_{\text{black piece}})$, then we have equality. In our case this is valid given that the estimates $\hat{\mat{C}}_{\mathrm{piece}}$ fulfill this property with a small error. The $6$ matrices $(\mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}})) / 2$ are called \emph{piece square tables} (PSQT) which are visualized in \cref{fig:psqt}. The interpretation of those tables is straight forward. A high positive values (blue) means that it is usually good to have a piece of the corresponding type on that square while a high negative value (red) means the opposite. It needs to be considered that the PSQTs are for quiet positions only, that means all pieces are save in the sense that there is no legal capturing moves nore is the king in check. +If $\mat{C}_{\text{white piece}} = -\mat{M}(\mat{C}_{\text{black piece}})$ holds for every piece type ($6$ types, \emph{not} distinguishing between colors), then we have equality. In our case, this is valid given that the estimates $\hat{\mat{C}}_{\mathrm{piece}}$ fulfill this property up to a small error. The $6$ matrices $(\mat{C}_{\text{white piece}} - \mat{M}(\mat{C}_{\text{black piece}})) / 2$ are called \emph{piece square tables} (PSQT), which are visualized in \cref{fig:psqt}. The interpretation of these tables is straightforward. A high positive value (blue) means that it is usually good to have a piece of the corresponding type on that square, while a high negative value (red) means the opposite. It needs to be kept in mind that the PSQTs are for quiet positions only, which means all pieces are safe in the sense that there are no legal capturing moves and the king is not in check. \begin{figure}[hp!]
\centering @@ -1156,13 +1106,20 @@ Next, going over the PSQTs one by one, a few words about the preferred positions The results of our analysis in the previous paragraph agree with the configuration of the chess board most associated with observed chess game outcomes. This arrangement also aligns with the understanding of human chess players of an average configuration at any moment during the game. \section{Discussion} -We have addressed sufficient dimension reduction for tensor valued predictors for regression or classification problems. Proposing a generalized multilinear model modeling the inverse conditional distribution we provided a multilinear sufficient reduction with consistent and asymptotic normal parameters. Moreover, our ansatz for proving the asymptotic results required by leveraging manifolds as a basis for resolving the issue of unidentifiable parameters lead to an even more flexible modeling framework. This allows to build complex and potentially problem specific parameter spaces incorporating additional domain specific knownledge into the model. +%We have addressed sufficient dimension reduction for tensor valued predictors for regression or classification problems. +We propose a generalized multi-linear model formulation for the inverse conditional distribution of a tensor-valued predictor given a response and derive a multi-linear sufficient reduction for the corresponding forward regression/classification problem. We also propose estimators for the sufficient reduction and show they are consistent and asymptotically normal. Obtaining the asymptotic results required leveraging manifolds as a basis for resolving the issue of unidentifiable parameters. This in turn led to an even more flexible modeling framework, which allows building complex and potentially problem-specific parameter spaces that incorporate additional domain-specific knowledge into the model. -Our multi-linear Ising model can be thought of as the extension of the Ising model-based approach of \textcite{ChengEtAl2014}, where a $q$-dimensional binary vector is regressed on a $p$-dimensional continuous vector. Yet, our model does not require penalization or sparsity assumptions, despite the tensor nature of the data, by leveraging the inherent structural information of the tensor-valued covariates assuming separable first and second moments. Moreover, it can accommodate a mixture of continuous and binary tensor-valued predictors, which is a subject of future work. +%A case in point is the popular one-parameter Ising model \parencite[e.g.][]{MukherjeeEtAl2022,NguyenEtAl2017} in Statistical Physics, which is parametrized via a single scaling factor of the covariance. Our model is capable of representing this specific structure via a linear Ising model where the parameters are represented by a one dimensional matrix manifold. Although this is interesting from a theoretical point of view, the design of our approach is \emph{not} intended for this kind of models. -An additional powerful extension of our model involves considering a sum of separable Kronecker predictors. This is motivated by the equivalence of a Kronecker product to a rank 1 tensor. By allowing a sum of a few separable Kronecker predictors, we remove the implicit rank 1 constraint. However, if this extension is to be applied to the SDR setting, as in this paper, it is crucial to ensure that the sum of Kronecker products forms a parameter manifold to apply our theory. 
While we anticipate that this approach can lead to intriguing and powerful models, there are certain details that need to be resolved first. +We allude to this feature of our approach in \cref{sec:matrix-manifolds}, where we also tabulate different matrix manifolds that can be used as building blocks $\manifold{B}_k$ and $\manifold{O}_k$ of the parameter space in \cref{tab:matrix-manifolds}. For example, our formulation can easily accommodate longitudinal data tabulated in matrix format, where the rows are covariates and the columns are consecutive time points with discrete AR($k$) dependence structure. -\todo{finish!} +Our multi-linear Ising model can be thought of as an extension of the Ising model-based approach of \textcite{ChengEtAl2014}, where a $q$-dimensional binary vector is regressed on a $p$-dimensional continuous vector. Yet, our model leverages the inherent structural information of the tensor-valued covariates by assuming separable first and second moments. By doing so, it bypasses the usual sparsity assumptions or penalization requirements, despite the high-dimensional tensor nature of the data. Moreover, it can accommodate a mixture of continuous and binary tensor-valued predictors, which is the subject of future work. + + +%Another interesting future research is to better understand the surprising behavior of TSIR \parencite{DingCook2015} we discovered in \cref{sec:sim-tensor-normal}, especially as compared with our multi-linear normal model for different signal-to-noise ratios. + +An additional powerful extension of our model involves considering a sum of separable Kronecker predictors. This is motivated by the equivalence of a Kronecker product to a rank $1$ tensor. By allowing a sum of a few separable Kronecker predictors, we remove the implicit rank $1$ constraint. However, if this extension is to be applied to the SDR setting, as in this paper, it is crucial to ensure that the sum of Kronecker products forms a parameter manifold.% to apply our theory. +%While we anticipate that this approach can lead to intriguing and powerful models, certain details need to be resolved first. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \printbibliography[heading=bibintoc, title={References}] @@ -1219,31 +1176,6 @@ as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matr \begin{displaymath} \t{(\vec{\ten{A}})}\vec\Bigl(\bigotimes_{k = r}^{1}\t{\mat{B}_k}\Bigr) \end{displaymath} - -% \begin{lemma}\label{thm:kron-perm} -% Given $r$ matrices $\mat{A}_k$ of dimension $p_j\times q_j$ for $k = 1, \ldots, r$, then there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that -% \begin{equation}\label{eq:kron-to-outer-perm} -% \vec\bigkron_{k = r}^{1}\mat{A}_j = \mat{S}_{\mat{p}, \mat{q}}\vec\bigouter_{k = 1}^{r}\mat{A}_k.
-% \end{equation} -% The permutation $\mat{S}_{\mat{p}, \mat{q}}$ with indices $\mat{p} = (p_1, \ldots, p_r)$ and $\mat{q} = (q_1, \ldots, q_r)$ is the matrix-matrix product of $r - 1$ permutation matrices given by -% \begin{multline}\label{eq:S_pq} -% \mat{S}_{\mat{p}, \mat{q}} = -% \Bigl[ \mat{I}_1\otimes \Bigl( \mat{I}_{\prod_{k = r}^{2}q_k}\otimes\mat{K}_{q_1, \prod_{k = r}^{2}p_k}\otimes I_{p_1} \Bigr)\Bigr] \\ -% \Bigl[ \mat{I}_{p_1 q_1}\otimes \Bigl( \mat{I}_{\prod_{k = r}^{3}q_k}\otimes\mat{K}_{q_2, \prod_{k = r}^{3}p_k}\otimes I_{p_2} \Bigr) \Bigr] -% \cdots -% \Bigl[ \mat{I}_{\prod_{k = 1}^{r - 2}p_k q_k}\otimes \Bigl( \mat{I}_{q_r}\otimes\mat{K}_{q_{r - 1}, p_r}\otimes I_{p_{r - 1}} \Bigr) \Bigr] -% \end{multline} -% where $\mat{K}_{p, q}$ is the \emph{commutation matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$. -% \end{lemma} -% \begin{proof} -% \textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} states that -% \begin{align*} -% \vec(\mat{A}_2\otimes\mat{A}_1) -% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \\ -% &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})\vec(\mat{A}_1\circ \mat{A}_2). -% \end{align*} -% This proves the statement for $r = 2$. The general statement for $r > 2$ follows via induction using \textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} in conjunction with $\vec(\mat{C}\mat{a}\t{\mat{b}}) = (\mat{I}_{\dim(\mat{b})}\otimes\mat{C})\vec(\mat{a}\t{\mat{b}})$. -% \end{proof} \begin{lemma}\label{thm:kron-perm} Given $r \geq 2$ matrices $\mat{A}_k$ of dimension $p_k\times q_k$ for $k = 1, \ldots, r$, there exists a unique permutation matrix $\mat{S}_{\mat{p}, \mat{q}}$ such that \begin{equation}\label{eq:kron-to-outer-perm}