diff --git a/LaTeX/main.bib b/LaTeX/main.bib index 9493656..08a6478 100644 --- a/LaTeX/main.bib +++ b/LaTeX/main.bib @@ -1,41 +1,134 @@ -@article{KoldaBader2009, -author = {Kolda, Tamara G. and Bader, Brett W.}, -title = {Tensor Decompositions and Applications}, -journal = {SIAM Review}, -volume = {51}, -number = {3}, -pages = {455-500}, -year = {2009}, -doi = {10.1137/07070111X}, -URL = { -https://doi.org/10.1137/07070111X}, -eprint ={https://doi.org/10.1137/07070111X}, -abstract = { This survey provides an overview of higher-order tensor decompositions, their applications, and available software. A tensor is a multidimensional or N-way array. Decompositions of higher-order tensors (i.e., N-way arrays with \$N \geq 3\$) have applications in psycho-metrics, chemometrics, signal processing, numerical linear algebra, computer vision, numerical analysis, data mining, neuroscience, graph analysis, and elsewhere. Two particular tensor decompositions can be considered to be higher-order extensions of the matrix singular value decomposition: CANDECOMP/PARAFAC (CP) decomposes a tensor as a sum of rank-one tensors, and the Tucker decomposition is a higher-order form of principal component analysis. There are many other tensor decompositions, including INDSCAL, PARAFAC2, CANDELINC, DEDICOM, and PARATUCK2 as well as nonnegative variants of all of the above. The N-way Toolbox, Tensor Toolbox, and Multilinear Engine are examples of software packages for working with tensors. } +@book{AbadirMagnus2005, + title = {Matrix Algebra}, + author = {Abadir, Karim M. and Magnus, Jan R.}, + year = {2005}, + publisher = {Cambridge University Press}, + series = {Econometric Exercises}, + collection = {Econometric Exercises}, + place = {Cambridge}, + doi = {10.1017/CBO9780511810800} } -@article{RegMatrixReg-ZhouLi2014, - author = {Zhou, Hua and Li, Lexin}, - title = {Regularized matrix regression}, - journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)}, - volume = {76}, +@book{AbsilEtAt2007, + title = {{Optimization Algorithms on Matrix Manifolds}}, + author = {Absil, P.-A. and Mahony, R. and Sepulchre, R.}, + year = {2007}, + publisher = {Princeton University Press}, + isbn = {9780691132983}, + note = {Full Online Text \url{https://press.princeton.edu/absil}} +} + +@book{Arnold1981, + title = {The theory of linear models and multivariate analysis}, + author = {Arnold, Steven F}, + year = {1981}, + series = {Wiley series in probability and mathematical statistics : Probability and mathematical statistics}, + publisher = {Wiley}, + isbn = {0471050652}, + language = {eng}, + address = {New York, NY [u.a.]}, + keywords = {Multivariate Analyse} +} + +@article{BasserPajevic2000, + title = {Statistical artifacts in diffusion tensor MRI (DT-MRI) caused by background noise}, + author = {Basser, Peter J. 
and Pajevic, Sinisa}, + year = {2000}, + journal = {Magnetic Resonance in Medicine}, + volume = {44}, + number = {1}, + pages = {41-50}, + doi = {10.1002/1522-2594(200007)44:1<41::AID-MRM8>3.0.CO;2-O}, + url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/1522-2594%28200007%2944%3A1%3C41%3A%3AAID-MRM8%3E3.0.CO%3B2-O} +} + +@article{BasserPajevic2003, + title = {A normal distribution for tensor-valued random variables: applications to diffusion tensor MRI}, + author = {Basser, Peter J and Pajevic, Sinisa}, + journal = {IEEE transactions on medical imaging}, + volume = {22}, + number = {7}, + pages = {785--794}, + year = {2003}, + publisher = {IEEE} +} + +@article{BasserPajevic2007, + title = {Spectral decomposition of a 4th-order covariance tensor: Applications to diffusion tensor MRI}, + journal = {Signal Processing}, + volume = {87}, number = {2}, - pages = {463--483}, - year = {2014}, - publisher = {[Royal Statistical Society, Wiley]} + pages = {220-236}, + year = {2007}, + note = {Tensor Signal Processing}, + issn = {0165-1684}, + doi = {https://doi.org/10.1016/j.sigpro.2006.02.050}, + url = {https://www.sciencedirect.com/science/article/pii/S0165168406001678}, + author = {Peter J. Basser and Sinisa Pajevic} } -@inproceedings{Nesterov1983, - title = {A method of solving a convex programming problem with convergence rate $O(1/k^2)$}, - author = {Nesterov, Yurii Evgen'evich}, - booktitle = {Doklady Akademii Nauk}, - volume = {269}, - number = {3}, - pages = {543--547}, - year = {1983}, - organization= {Russian Academy of Sciences} +@article{BrownEtAt2001, + title = {Bayesian discrimination with longitudinal data}, + author = {Brown, P. J. and Kenward, M. G. and Bassett, E. E.}, + year = {2001}, + journal = {Biostatistics}, + volume = {2}, + number = {4}, + pages = {417-432}, + month = {12}, + issn = {1465-4644}, + doi = {10.1093/biostatistics/2.4.417} } -@book{StatInf-CasellaBerger2002, +@article{BuraDuarteForzani2016, + author = {Efstathia Bura and Sabrina Duarte and Liliana Forzani}, + title = {Sufficient Reductions in Regressions With Exponential Family Inverse Predictors}, + journal = {Journal of the American Statistical Association}, + volume = {111}, + number = {515}, + pages = {1313-1329}, + year = {2016}, + publisher = {Taylor \& Francis}, + doi = {10.1080/01621459.2015.1093944} +} + +@article{BuraEtAt2018, + author = {Bura, Efstathia and Duarte, Sabrina and Forzani, Liliana and E. Smucler and M. 
Sued}, + title = {Asymptotic theory for maximum likelihood estimates in reduced-rank multivariate generalized linear models}, + journal = {Statistics}, + volume = {52}, + number = {5}, + pages = {1005-1024}, + year = {2018}, + publisher = {Taylor \& Francis}, + doi = {10.1080/02331888.2018.1467420}, +} + +@article{BuraForzaniEtAt2022, + author = {Efstathia Bura and Liliana Forzani and Rodrigo Garcia Arancibia and Pamela Llop and Diego Tomassi}, + title = {Sufficient reductions in regression with mixed predictors}, + journal = {Journal of Machine Learning Research}, + year = {2022}, + volume = {23}, + number = {102}, + pages = {1--47}, + url = {http://jmlr.org/papers/v23/21-0175.html} +} + +@article{Burdick1995, + title = {An introduction to tensor products with applications to multiway data analysis}, + journal = {Chemometrics and Intelligent Laboratory Systems}, + volume = {28}, + number = {2}, + pages = {229-237}, + year = {1995}, + issn = {0169-7439}, + doi = {10.1016/0169-7439(95)80060-M}, + url = {https://www.sciencedirect.com/science/article/pii/016974399580060M}, + author = {Donald S. Burdick} +} + +@book{CasellaBerger2002, title = {{Statistical Inference}}, author = {Casella, George and Berger, Roger L.}, isbn = {0-534-24312-6}, @@ -45,15 +138,575 @@ abstract = { This survey provides an overview of higher-order tensor decompositi publisher = {Thomson Learning} } -@book{MatrixDiffCalc-MagnusNeudecker1999, - title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)}, - author = {Magnus, Jan R. and Neudecker, Heinz}, - year = {1999}, - publisher = {John Wiley \& Sons Ltd}, - isbn = {0-471-98632-1} +@article{ChenEtAt2021, + author = {Chen, You-Lin and Kolar, Mladen and Tsay, Ruey S.}, + title = {Tensor Canonical Correlation Analysis With Convergence and Statistical Guarantees}, + year = {2021}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {30}, + number = {3}, + pages = {728--744}, + doi = {10.1080/10618600.2020.1856118} } -@article{SymMatandJacobians-MagnusNeudecker1986, +@article{ChengEtAt2014, + author = {Cheng, Jie and Levina, Elizaveta and Wang, Pei and Zhu, Ji}, + title = {A sparse Ising model with covariates}, + journal = {Biometrics}, + volume = {70}, + number = {4}, + year = {2014}, + pages = {943-953}, + doi = {10.1111/biom.12202} +} + +@incollection{Comon2002, + author = {Comon, Pierre}, + isbn = {9780198507345}, + title = {{Tensor Decompositions: State of the Art and Applications}}, + booktitle = {{Mathematics in Signal Processing V}}, + publisher = {Oxford University Press}, + year = {2002}, + month = {06}, + doi = {10.1093/oso/9780198507345.003.0001}, + eprint = {https://academic.oup.com/book/0/chapter/422056726/chapter-pdf/52392862/isbn-9780198507345-book-part-1.pdf} +} + +@INPROCEEDINGS{Comon2009, + author = {Comon, Pierre}, + booktitle = {2009 IEEE/SP 15th Workshop on Statistical Signal Processing}, + title = {Tensors versus matrices usefulness and unexpected properties}, + year = {2009}, + volume = {}, + number = {}, + pages = {781-788}, + doi = {10.1109/SSP.2009.5278471} +} + +@article{Cook2007, + author = {Cook, R. Dennis}, + journal = {Statistical Science}, + month = {02}, + number = {1}, + pages = {1--26}, + publisher = {The Institute of Mathematical Statistics}, + title = {{Fisher Lecture: Dimension Reduction in Regression}}, + volume = {22}, + year = {2007}, + doi = {10.1214/088342306000000682} +} + +@article{Dai2012, + author = {B. 
Dai}, + title = {Multivariate bernoulli distribution models}, + year = {2012} +} + +@article{DaiDingWahba2013, + author = {B. Dai and S. Ding and G. Wahba}, + title = {Multivariate bernoulli distribution}, + year = {2013} +} + +@article{Dawid1981, + ISSN = {00063444}, + URL = {http://www.jstor.org/stable/2335827}, + abstract = {We introduce and justify a convenient notation for certain matrix-variate distributions which, by its emphasis on the important underlying parameters, and the theory on which it is based, eases greatly the task of manipulating such distributions. Important examples include the matrix-variate normal, t, F and beta, and the Wishart and inverse Wishart distributions. The theory is applied to compound matrix distributions and to Bayesian prediction in the multivariate linear model.}, + author = {A. P. Dawid}, + journal = {Biometrika}, + number = {1}, + pages = {265--274}, + publisher = {[Oxford University Press, Biometrika Trust]}, + title = {Some Matrix-Variate Distribution Theory: Notational Considerations and a Bayesian Application}, + urldate = {2024-01-12}, + volume = {68}, + year = {1981} +} + +@article{DeAlmeidaEtAt2007, + title = {PARAFAC-based unified tensor modeling for wireless communication systems with application to blind multiuser equalization}, + journal = {Signal Processing}, + volume = {87}, + number = {2}, + pages = {337-351}, + year = {2007}, + note = {Tensor Signal Processing}, + issn = {0165-1684}, + doi = {https://doi.org/10.1016/j.sigpro.2005.12.014}, + url = {https://www.sciencedirect.com/science/article/pii/S0165168406001757}, + author = {André L.F. {de Almeida} and Gérard Favier and João Cesar M. Mota} +} + +@article{DeesMandic2019, + title = {A Statistically Identifiable Model for Tensor-Valued Gaussian Random Variables}, + author = {Bruno Scalzo Dees and Danilo P. Mandic}, + journal = {ArXiv}, + year = {2019}, + volume = {abs/1911.02915}, + url = {https://api.semanticscholar.org/CorpusID:207847615} +} + +@article{DeLathauwerCastaing2007, + title = {Tensor-based techniques for the blind separation of DS-CDMA signals}, + journal = {Signal Processing}, + volume = {87}, + number = {2}, + pages = {322-336}, + year = {2007}, + note = {Tensor Signal Processing}, + issn = {0165-1684}, + doi = {10.1016/j.sigpro.2005.12.015}, + url = {https://www.sciencedirect.com/science/article/pii/S0165168406001745}, + author = {Lieven {De Lathauwer} and Joséphine Castaing} +} + +@article{DingCook2015, + author = {Shanshan Ding and R. Dennis Cook}, + title = {Tensor sliced inverse regression}, + journal = {Journal of Multivariate Analysis}, + volume = {133}, + pages = {216-231}, + year = {2015}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2014.08.015} +} + +@article{DrtonEtAt2020, + title = {Existence and uniqueness of the Kronecker covariance MLE}, + author = {Mathias Drton and Satoshi Kuriki and Peter D. Hoff}, + journal = {The Annals of Statistics}, + year = {2020}, + url = {https://api.semanticscholar.org/CorpusID:212718000} +} + +@article{DrydenEtAt2009, + author = {Ian L. 
Dryden and Alexey Koloydenko and Diwei Zhou}, + title = {{Non-Euclidean statistics for covariance matrices, with applications to diffusion tensor imaging}}, + volume = {3}, + journal = {The Annals of Applied Statistics}, + number = {3}, + publisher = {Institute of Mathematical Statistics}, + pages = {1102 -- 1123}, + year = {2009}, + doi = {10.1214/09-AOAS249} +} + +@misc{Dutilleul1990, + title = {Apport en analyse spectrale d'un p\'eriodogramme modifi\'e et mod\'elisation des s\'eries chronologiques avec r\'ep\'etitions en vue de leur comparaison en fr\'equence}, + author = {Pierre Dutilleul}, + year = {1990}, + note = {Unpublished D.Sc. Dissertation}, + address = {Department of Mathematics. Universit\'e catholique de Louvian, Louvain-la-Neuve, Belgium} +} + +@article{Dutilleul1999, + author = {Pierre Dutilleul}, + title = {The mle algorithm for the matrix normal distribution}, + journal = {Journal of Statistical Computation and Simulation}, + volume = {64}, + number = {2}, + pages = {105-123}, + year = {1999}, + publisher = {Taylor & Francis}, + doi = {10.1080/00949659908811970} +} + +@article{Einstain1916, + author = {Einstein, Albert}, + title = {Die Grundlage der allgemeinen Relativitätstheorie}, + year = {1916}, + journal = {Annalen der Physik}, + volume = {354}, + number = {7}, + pages = {769-822}, + doi = {10.1002/andp.19163540702} +} + + +@article{GirkaEtAt2024, + title = {Tensor generalized canonical correlation analysis}, + author = {Fabien Girka and Arnaud Gloaguen and Laurent {Le Brusquet} and Violetta Zujovic and Arthur Tenenhaus}, + year = {2024}, + journal = {Information Fusion}, + volume = {102}, + issn = {1566-2535}, + doi = {10.1016/j.inffus.2023.102045} +} + +@book{GoodfellowEtAt2016, + title = {Deep Learning}, + author = {Ian Goodfellow and Yoshua Bengio and Aaron Courville}, + publisher = {MIT Press}, + url = {\url{http://www.deeplearningbook.org}}, + year = {2016} +} + +@article{GreenewaldHero2014, + title = {Robust Kronecker Product PCA for Spatio-Temporal Covariance Estimation}, + author = {Kristjan H. Greenewald and Alfred O. Hero}, + journal = {IEEE Transactions on Signal Processing}, + year = {2014}, + volume = {63}, + pages = {6368-6378}, + url = {https://api.semanticscholar.org/CorpusID:15582097} +} + +@misc{HajriEtAt2017, + title = {Maximum Likelihood Estimators on Manifolds}, + author = {Hajri, Hatem and Said, Salem and Berthoumieu, Yannick}, + year = {2017}, + journal = {Lecture Notes in Computer Science}, + publisher = {Springer International Publishing}, + pages = {692-700}, + doi = {10.1007/978-3-319-68445-1_80} +} + +@book{Harville1997, + title = {Matrix Algebra From a Statistician's Perspective}, + edition = {1}, + author = {David A. Harville}, + year = {1997}, + publisher = {Springer-Verlag}, + address = {New York}, + chapter = {15} +} + +@article{HillarLim2013, + author = {Hillar, Christopher J. and Lim, Lek\-Heng}, + title = {Most Tensor Problems Are NP-Hard}, + year = {2013}, + issue_date = {November 2013}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + volume = {60}, + number = {6}, + issn = {0004-5411}, + url = {https://doi.org/10.1145/2512329}, + doi = {10.1145/2512329}, + journal = {J. ACM}, + month = {nov}, + articleno = {45}, + numpages = {39} +} + +@misc{Hinton2012, + title = {Neural networks for machine learning}, + author = {Hinton, G.}, + year = {2012} +} + +@article{Hoff2011, + author = {Peter D. 
Hoff}, + title = {{Separable covariance arrays via the Tucker product, with applications to multivariate relational data}}, + volume = {6}, + journal = {Bayesian Analysis}, + number = {2}, + publisher = {International Society for Bayesian Analysis}, + pages = {179 -- 196}, + keywords = {Gaussian, matrix normal, multiway data, network, tensor, Tucker decomposition}, + year = {2011}, + doi = {10.1214/11-BA606} +} + +@article{Hoff2015, + author = {Peter D. Hoff}, + title = {{Multilinear tensor regression for longitudinal relational data}}, + volume = {9}, + journal = {The Annals of Applied Statistics}, + number = {3}, + publisher = {Institute of Mathematical Statistics}, + pages = {1169 -- 1193}, + keywords = {Array normal, Bayesian inference, event data, international relations, network, Tucker product, vector autoregression}, + year = {2015}, + doi = {10.1214/15-AOAS839} +} + +@article{HuLeeWang2022, + author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, + title = {Generalized Tensor Decomposition With Features on Multiple Modes}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {31}, + number = {1}, + pages = {204-218}, + year = {2022}, + publisher = {Taylor \& Francis}, + doi = {10.1080/10618600.2021.1978471}, +} + +@article{Ising1924, + author = {Ising, Ernst}, + title = {{Beitrag zur Theorie des Ferromagnetismus}}, + journal = {Zeitschrift f\"ur Physik}, + pages = {253-258}, + volume = {31}, + number = {1}, + year = {1924}, + month = {2}, + issn = {0044-3328}, + doi = {10.1007/BF02980577} +} + +@article{JungEtAt2019, + author = {Sungkyu Jung and Jeongyoun Ahn and Yongho Jeon}, + title = {Penalized Orthogonal Iteration for Sparse Estimation of Generalized Eigenvalue Problem}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {3}, + pages = {710-721}, + year = {2019}, + publisher = {Taylor & Francis}, + doi = {10.1080/10618600.2019.1568014} +} + +@book{Kaltenbaeck2021, + title = {Aufbau Analysis}, + author = {Kaltenb\"ack, Michael}, + isbn = {978-3-88538-127-3}, + series = {Berliner Studienreihe zur Mathematik}, + edition = {27}, + year = {2021}, + publisher = {Heldermann Verlag} +} + +@article{Kapla2019, + title = {Comparison of Different Word Embeddings and Neural Network Types for Sentiment Analysis of German Political Speeches}, + author = {Kapla, Daniel}, + year = {2019} +} + +@inproceedings{KofidisRegalia2005, + title = {Tensor Approximation and Signal Processing Applications}, + author = {Eleftherios Kofidis and Phillip A. Regalia}, + year = {2005}, + url = {https://api.semanticscholar.org/CorpusID:13667742} +} + +@article{Kolda2006, + title = {Multilinear operators for higher-order decompositions.}, + author = {Kolda, Tamara Gibson}, + doi = {10.2172/923081}, + url = {https://www.osti.gov/biblio/923081}, + place = {United States}, + year = {2006}, + month = {4}, + type = {Technical Report} +} + +@article{KoldaBader2009, + author = {Kolda, Tamara G. 
and Bader, Brett W.}, + title = {Tensor Decompositions and Applications}, + journal = {SIAM Review}, + volume = {51}, + number = {3}, + pages = {455-500}, + year = {2009}, + doi = {10.1137/07070111X} +} + +@book{KolloVonRosen2005, + title = {Advanced Multivariate Statistics with Matrices}, + isbn = {978-1-4020-3419-0}, + doi = {10.1007/1-4020-3419-9}, + publisher = {Springer Dordrecht}, + author = {Kollo, T\~onu and von Rosen, Dietrich}, + editor = {Hazewinkel, M.}, + year = {2005} +} + +@book{Kroonenberg2008, + title = {Applied Multiway Data Analysis}, + author = {Kroonenberg, Pieter M.}, + year = {2008}, + publisher = {John Wiley \& Sons, Ltd}, + address = {New York}, + isbn = {9780470238004}, + doi = {10.1002/9780470238004}, +} + +@book{Kusolitsch2011, + title = {{M}a\ss{}- und {W}ahrscheinlichkeitstheorie}, + subtitle = {{E}ine {E}inf{\"u}hrung}, + author = {Kusolitsch, Norbert}, + series = {Springer-Lehrbuch}, + year = {2011}, + publisher = {Springer Vienna}, + isbn = {978-3-7091-0684-6}, + doi = {10.1007/978-3-7091-0685-3} +} + +@article{LandgrafLee2020, + title = {Dimensionality reduction for binary data through the projection of natural parameters}, + journal = {Journal of Multivariate Analysis}, + volume = {180}, + pages = {104668}, + year = {2020}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2020.104668}, + author = {Andrew J. Landgraf and Yoonkyung Lee} +} + +@article{LeBihanEtAt2001, + title = {Diffusion tensor imaging: Concepts and applications}, + author = {Le Bihan, Denis and Mangin, Jean-François and Poupon, Cyril and Clark, Chris A. and Pappata, Sabina and Molko, Nicolas and Chabriat, Hughes}, + year = {2001}, + journal = {Journal of Magnetic Resonance Imaging}, + volume = {13}, + number = {4}, + pages = {534-546}, + doi = {https://doi.org/10.1002/jmri.1076}, + url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/jmri.1076} +} + +@book{Lee2012, + title = {Introduction to Smooth Manifolds}, + author = {Lee, John M.}, + year = {2012}, + journal = {Graduate Texts in Mathematics}, + publisher = {Springer New York}, + doi = {10.1007/978-1-4419-9982-5} +} + +@book{Lee2018, + title = {Introduction to Riemannian Manifolds}, + author = {Lee, John M.}, + year = {2018}, + journal = {Graduate Texts in Mathematics}, + publisher = {Springer International Publishing}, + doi = {10.1007/978-3-319-91755-9} +} + +@article{LengPan2018, + author = {Leng, Chenlei and Pan, Guangming}, + title = {{Covariance estimation via sparse Kronecker structures}}, + volume = {24}, + journal = {Bernoulli}, + number = {4B}, + publisher = {Bernoulli Society for Mathematical Statistics and Probability}, + pages = {3833 -- 3863}, + year = {2018}, + doi = {10.3150/17-BEJ980} +} + +@article{LeporeEtAt2008, + author = {Lepore, Natasha and Brun, Caroline and Chou, Yi-Yu and Chiang, Ming-Chang and Dutton, Rebecca A. and Hayashi, Kiralee M. and Luders, Eileen and Lopez, Oscar L. and Aizenstein, Howard J. and Toga, Arthur W. and Becker, James T. and Thompson, Paul M.}, + journal = {IEEE Transactions on Medical Imaging}, + title = {Generalized Tensor-Based Morphometry of HIV/AIDS Using Multivariate Statistics on Deformation Tensors}, + year = {2008}, + volume = {27}, + number = {1}, + pages = {129-141}, + doi = {10.1109/TMI.2007.906091} +} + +@article{LeurgansRoss1992, + title = {{Multilinear Models: Applications in Spectroscopy}}, + author = {Sue Leurgans and Robert T. 
Ross},
+    year = {1992},
+    journal = {Statistical Science},
+    volume = {7},
+    number = {3},
+    publisher = {Institute of Mathematical Statistics},
+    pages = {289 -- 310},
+    keywords = {Multi-mode factor analysis, nonlinear least-squares, PARAFAC, three-way arrays},
+    doi = {10.1214/ss/1177011225}
+}
+
+@article{Li1991,
+    title = {Sliced Inverse Regression for Dimension Reduction},
+    author = {Li, K. C.},
+    journal = {Journal of the American Statistical Association},
+    volume = {86},
+    number = {414},
+    pages = {316-327},
+    year = {1991},
+}
+
+@article{Li1992,
+    author = {Ker-Chau Li},
+    title = {On Principal Hessian Directions for Data Visualization and Dimension Reduction: Another Application of Stein's Lemma},
+    journal = {Journal of the American Statistical Association},
+    volume = {87},
+    number = {420},
+    pages = {1025-1039},
+    year = {1992},
+    publisher = {Taylor \& Francis},
+    doi = {10.1080/01621459.1992.10476258},
+}
+
+@article{LiKimAltman2010,
+    author = {Bing Li and Min Kyung Kim and Naomi Altman},
+    title = {{On dimension folding of matrix- or array-valued statistical objects}},
+    volume = {38},
+    journal = {The Annals of Statistics},
+    number = {2},
+    publisher = {Institute of Mathematical Statistics},
+    pages = {1094 -- 1121},
+    keywords = {directional regression, electroencephalography, Kronecker envelope, sliced average variance estimate, sliced inverse regression},
+    year = {2010},
+    doi = {10.1214/09-AOS737}
+}
+
+% TODO: check this!
+@inbook{LiuKoike2007,
+    title = {Extending Multivariate Space-Time Geostatistics for Environmental Data Analysis},
+    author = {Chunxue Liu and Katsuaki Koike},
+    year = {2007},
+    journal = {Mathematical Geology},
+    publisher = {International Association for Mathematical Geology},
+    pages = {289--305},
+    doi = {10.1007/s11004-007-9085-9}
+}
+
+@article{LuZimmerman2005,
+    title = {The likelihood ratio test for a separable covariance matrix},
+    author = {Nelson Lu and Dale L. 
Zimmerman}, + year = {2005}, + journal = {Statistics \& Probability Letters}, + volume = {73}, + number = {4}, + pages = {449-457}, + issn = {0167-7152}, + doi = {10.1016/j.spl.2005.04.020}, + url = {https://www.sciencedirect.com/science/article/pii/S0167715205001495} +} + +@article{MagnusNeudecker1986, title = {Symmetry, 0-1 Matrices and Jacobians: A Review}, author = {Magnus, Jan R. and Neudecker, Heinz}, ISSN = {02664666, 14694360}, @@ -67,99 +720,82 @@ abstract = { This survey provides an overview of higher-order tensor decompositi year = {1986} } -@book{MatrixAlgebra-AbadirMagnus2005, - title = {Matrix Algebra}, - author = {Abadir, Karim M. and Magnus, Jan R.}, - year = {2005}, - publisher = {Cambridge University Press}, - series = {Econometric Exercises}, - collection = {Econometric Exercises}, - place = {Cambridge}, - doi = {10.1017/CBO9780511810800} +@book{MagnusNeudecker1999, + title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)}, + author = {Magnus, Jan R. and Neudecker, Heinz}, + year = {1999}, + publisher = {John Wiley \& Sons Ltd}, + isbn = {0-471-98632-1} } -@article{TensorDecomp-HuLeeWang2022, - author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, - title = {Generalized Tensor Decomposition With Features on Multiple Modes}, - journal = {Journal of Computational and Graphical Statistics}, - volume = {31}, - number = {1}, - pages = {204-218}, - year = {2022}, - publisher = {Taylor \& Francis}, - doi = {10.1080/10618600.2021.1978471}, +@article{ManceurDutilleul2013, + title = {Maximum likelihood estimation for the tensor normal distribution: Algorithm, minimum sample size, and empirical bias and dispersion}, + author = {Ameur M. Manceur and Pierre Dutilleul}, + journal = {Journal of Computational and Applied Mathematics}, + volume = {239}, + pages = {37-49}, + year = {2013}, + issn = {0377-0427}, + doi = {10.1016/j.cam.2012.09.017}, + url = {https://www.sciencedirect.com/science/article/pii/S0377042712003810} } -@article{CovarEstSparseKron-LengPan2018, - author = {Leng, Chenlei and Pan, Guangming}, - title = {{Covariance estimation via sparse Kronecker structures}}, - volume = {24}, - journal = {Bernoulli}, - number = {4B}, - publisher = {Bernoulli Society for Mathematical Statistics and Probability}, - pages = {3833 -- 3863}, - year = {2018}, - doi = {10.3150/17-BEJ980} +@article{MardiaGoodall1993, + title = {Spatial-Temporal Analysis of Multivariate Environmental Monitoring Data}, + author = {Mardia, Kanti and Goodall, Colin}, + year = {1993}, + month = {01}, + pages = {}, + volume = {6}, + publisher = {Elsevier Science Publisher B.V.} } -@article{sdr-PfeifferKaplaBura2021, - author = {Pfeiffer, Ruth and Kapla, Daniel and Bura, Efstathia}, - title = {{Least squares and maximum likelihood estimation of sufficient reductions in regressions with matrix-valued predictors}}, - volume = {11}, - year = {2021}, - journal = {International Journal of Data Science and Analytics}, - doi = {10.1007/s41060-020-00228-y} +@InProceedings{MartinFernandez2004, + author = {Mart{\'i}n-Fern{\'a}ndez, Marcos and Westin, Carl-Fredrik and Alberola-L{\'o}pez, Carlos}, + editor = {Barillot, Christian and Haynor, David R. 
and Hellier, Pierre}, + title = {3D Bayesian Regularization of Diffusion Tensor MRI Using Multivariate Gaussian Markov Random Fields}, + booktitle = {Medical Image Computing and Computer-Assisted Intervention -- MICCAI 2004}, + year = {2004}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, + pages = {351--359}, + abstract = {3D Bayesian regularization applied to diffusion tensor MRI is presented here. The approach uses Markov Random Field ideas and is based upon the definition of a 3D neighborhood system in which the spatial interactions of the tensors are modeled. As for the prior, we model the behavior of the tensor fields by means of a 6D multivariate Gaussian local characteristic. As for the likelihood, we model the noise process by means of conditionally independent 6D multivariate Gaussian variables. Those models include inter-tensor correlations, intra-tensor correlations and colored noise. The solution tensor field is obtained by using the simulated annealing algorithm to achieve the maximum a posteriori estimation. Several experiments both on synthetic and real data are presented, and performance is assessed with mean square error measure.}, + isbn = {978-3-540-30135-6} } -@article{sdr-BuraDuarteForzani2016, - author = {Bura, Efstathia and Duarte, Sabrina and Forzani, Liliana}, - title = {Sufficient Reductions in Regressions With Exponential Family Inverse Predictors}, - journal = {Journal of the American Statistical Association}, - volume = {111}, - number = {515}, - pages = {1313-1329}, - year = {2016}, - publisher = {Taylor \& Francis}, - doi = {10.1080/01621459.2015.1093944} +@book{McCullagh1987, + title = {Tensor Methods in Statistics}, + subtitle = {Monographs on Statistics and Applied Probability}, + author = {McCullagh, Peter}, + year = {1987}, + publisher = {Chapman and Hall/CRC}, + doi = {10.1201/9781351077118} } -@article{FisherLectures-Cook2007, - author = {Cook, R. Dennis}, - journal = {Statistical Science}, - month = {02}, - number = {1}, - pages = {1--26}, - publisher = {The Institute of Mathematical Statistics}, - title = {{Fisher Lecture: Dimension Reduction in Regression}}, - volume = {22}, - year = {2007}, - doi = {10.1214/088342306000000682} +@inproceedings{Nesterov1983, + title = {A method of solving a convex programming problem with convergence rate $O(1/k^2)$}, + author = {Nesterov, Yurii Evgen'evich}, + booktitle = {Doklady Akademii Nauk}, + volume = {269}, + number = {3}, + pages = {543--547}, + year = {1983}, + organization= {Russian Academy of Sciences} } -@article{asymptoticMLE-BuraEtAl2018, - author = {Bura, Efstathia and Duarte, Sabrina and Forzani, Liliana and E. Smucler and M. Sued}, - title = {Asymptotic theory for maximum likelihood estimates in reduced-rank multivariate generalized linear models}, - journal = {Statistics}, - volume = {52}, - number = {5}, - pages = {1005-1024}, - year = {2018}, - publisher = {Taylor \& Francis}, - doi = {10.1080/02331888.2018.1467420}, -} - -@article{tsir-DingCook2015, - author = {Shanshan Ding and R. 
Dennis Cook}, - title = {Tensor sliced inverse regression}, +@article{OhlsonEtAt2013, + title = {The multilinear normal distribution: Introduction and some basic properties}, + author = {Ohlson, Martin and Ahmad, Mumtaz Rauf and von Rosen, Dietrich}, + year = {2013}, journal = {Journal of Multivariate Analysis}, - volume = {133}, - pages = {216-231}, - year = {2015}, + volume = {113}, + pages = {37-47}, issn = {0047-259X}, - doi = {10.1016/j.jmva.2014.08.015} + doi = {10.1016/j.jmva.2011.05.015}, + url = {https://www.sciencedirect.com/science/article/pii/S0047259X11001047} } -@article{lsir-PfeifferForzaniBura, +@article{PfeifferForzaniBura, author = {Pfeiffer, Ruth and Forzani, Liliana and Bura, Efstathia}, year = {2012}, month = {09}, @@ -170,50 +806,102 @@ abstract = { This survey provides an overview of higher-order tensor decompositi doi = {10.1002/sim.4437} } -@Inbook{ApproxKron-VanLoanPitsianis1993, - author = {Van Loan, C. F. and Pitsianis, N.}, - editor = {Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.}, - title = {Approximation with Kronecker Products}, - bookTitle = {Linear Algebra for Large Scale and Real-Time Applications}, - year = {1993}, - publisher = {Springer Netherlands}, - address = {Dordrecht}, - pages = {293--314}, - isbn = {978-94-015-8196-7}, - doi = {10.1007/978-94-015-8196-7_17} +@article{PfeifferKaplaBura2021, + author = {Pfeiffer, Ruth and Kapla, Daniel and Bura, Efstathia}, + title = {{Least squares and maximum likelihood estimation of sufficient reductions in regressions with matrix-valued predictors}}, + volume = {11}, + year = {2021}, + journal = {International Journal of Data Science and Analytics}, + doi = {10.1007/s41060-020-00228-y} } -@book{asymStats-van_der_Vaart1998, - title = {Asymptotic Statistics}, - author = {{van der Vaart}, A.W.}, - series = {Asymptotic Statistics}, - year = {1998}, - publisher = {Cambridge University Press}, - series = {Cambridge Series in Statistical and Probabilistic Mathematics}, - isbn = {0-521-49603-9} +@inproceedings{ShashuaHazan2005, + author = {Shashua, Amnon and Hazan, Tamir}, + title = {Non-Negative Tensor Factorization with Applications to Statistics and Computer Vision}, + year = {2005}, + isbn = {1595931805}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + doi = {10.1145/1102351.1102451}, + booktitle = {Proceedings of the 22nd International Conference on Machine Learning}, + pages = {792--799}, + numpages = {8}, + location = {Bonn, Germany}, + series = {ICML '05} } -@book{measureTheory-Kusolitsch2011, - title = {{M}a\ss{}- und {W}ahrscheinlichkeitstheorie}, - subtitle = {{E}ine {E}inf{\"u}hrung}, - author = {Kusolitsch, Norbert}, - series = {Springer-Lehrbuch}, - year = {2011}, - publisher = {Springer Vienna}, - isbn = {978-3-7091-0684-6}, - doi = {10.1007/978-3-7091-0685-3} +@article{Soize2008, + title = {Tensor-valued random fields for meso-scale stochastic model of anisotropic elastic microstructure and probabilistic analysis of representative volume element size}, + author = {C. Soize}, + year = {2008}, + journal = {Probabilistic Engineering Mechanics}, + volume = {23}, + number = {2}, + pages = {307-323}, + note = {5th International Conference on Computational Stochastic Mechanics}, + issn = {0266-8920}, + doi = {10.1016/j.probengmech.2007.12.019}, + url = {https://www.sciencedirect.com/science/article/pii/S0266892007000562} } -@book{optimMatrixMani-AbsilEtAl2007, - title = {{Optimization Algorithms on Matrix Manifolds}}, - author = {Absil, P.-A. 
and Mahony, R. and Sepulchre, R.}, - year = {2007}, - publisher = {Princeton University Press}, - isbn = {9780691132983}, - note = {Full Online Text \url{https://press.princeton.edu/absil}} +@article{SoloveychikTrushin2016, + title = {Gaussian and robust Kronecker product covariance estimation: Existence and uniqueness}, + author = {I. Soloveychik and D. Trushin}, + year = {2016}, + journal = {Journal of Multivariate Analysis}, + volume = {149}, + pages = {92-113}, + issn = {0047-259X}, + doi = {10.1016/j.jmva.2016.04.001}, + url = {https://www.sciencedirect.com/science/article/pii/S0047259X16300070} } -@Inbook{geomMethodsOnLowRankMat-Uschmajew2020, +@misc{SongHero2023, + title = {On Separability of Covariance in Multiway Data Analysis}, + author = {Dogyoon Song and Alfred O. Hero}, + year = {2023}, + eprint = {2302.02415}, + archivePrefix = {arXiv}, + primaryClass = {math.ST}, + doi = {10.48550/arXiv.2302.02415} +} + +@article{SrivastavaEtAt2008, + title = {Models with a Kronecker product covariance structure: Estimation and testing}, + author = {Srivastava, Muni Shanker and von Rosen, Tatjana and von Rosen, Dietrich}, + year = {2008}, + journal = {Mathematical Methods of Statistics}, + month = {Dec}, + day = {01}, + volume = {17}, + number = {4}, + pages = {357-370}, + issn = {1934-8045}, + doi = {10.3103/S1066530708040066} +} + +@book{SrivastavaKhatri1979, + title = {An introduction to multivariate statistics}, + author = {Srivastava, Muni Shanker and Khatri, Chinubal G.}, + year = {1979}, + publisher = {North Holland}, + isbn = {0444003029}, + language = {eng}, + address = {New York, NY [u.a.]} +} + +@article{TsiligkaridisHero2013, + author = {Tsiligkaridis, Theodoros and Hero, Alfred O.}, + journal = {IEEE Transactions on Signal Processing}, + title = {Covariance Estimation in High Dimensions Via Kronecker Product Expansions}, + year = {2013}, + volume = {61}, + number = {21}, + pages = {5347-5360}, + doi = {10.1109/TSP.2013.2279355} +} + +@Inbook{Uschmajew2020, author = {Uschmajew, Andr{\'e} and Vandereycken, Bart}, editor = {Grohs, Philipp and Holler, Martin and Weinmann, Andreas}, title = {Geometric Methods on Low-Rank Matrix and Tensor Manifolds}, @@ -226,168 +914,96 @@ abstract = { This survey provides an overview of higher-order tensor decompositi doi = {10.1007/978-3-030-31351-7_9} } -@book{introToSmoothMani-Lee2012, - title = {Introduction to Smooth Manifolds}, - author = {Lee, John M.}, - year = {2012}, - journal = {Graduate Texts in Mathematics}, - publisher = {Springer New York}, - doi = {10.1007/978-1-4419-9982-5} +@book{vanderVaart1998, + title = {Asymptotic Statistics}, + author = {{van der Vaart}, A.W.}, + series = {Asymptotic Statistics}, + year = {1998}, + publisher = {Cambridge University Press}, + series = {Cambridge Series in Statistical and Probabilistic Mathematics}, + isbn = {0-521-49603-9} } -@book{introToRiemannianMani-Lee2018, - title = {Introduction to Riemannian Manifolds}, - author = {Lee, John M.}, - year = {2018}, - journal = {Graduate Texts in Mathematics}, - publisher = {Springer International Publishing}, - doi = {10.1007/978-3-319-91755-9} +@Inbook{VanLoanPitsianis1993, + author = {Van Loan, C. F. and Pitsianis, N.}, + editor = {Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. 
R.}, + title = {Approximation with Kronecker Products}, + bookTitle = {Linear Algebra for Large Scale and Real-Time Applications}, + year = {1993}, + publisher = {Springer Netherlands}, + address = {Dordrecht}, + pages = {293--314}, + isbn = {978-94-015-8196-7}, + doi = {10.1007/978-94-015-8196-7_17} } -@misc{MLEonManifolds-HajriEtAl2017, - title = {Maximum Likelihood Estimators on Manifolds}, - author = {Hajri, Hatem and Said, Salem and Berthoumieu, Yannick}, - year = {2017}, - journal = {Lecture Notes in Computer Science}, - publisher = {Springer International Publishing}, - pages = {692-700}, - doi = {10.1007/978-3-319-68445-1_80} +@article{WangEtAt2022, + author = {Yu Wang and Zeyu Sun and Dogyoon Song and Alfred Hero}, + title = {{Kronecker-structured covariance models for multiway data}}, + volume = {16}, + journal = {Statistics Surveys}, + number = {none}, + publisher = {Amer. Statist. Assoc., the Bernoulli Soc., the Inst. Math. Statist., and the Statist. Soc. Canada}, + pages = {238 -- 270}, + year = {2022}, + doi = {10.1214/22-SS139} } -@article{relativity-Einstain1916, - author = {Einstein, Albert}, - title = {Die Grundlage der allgemeinen Relativitätstheorie}, - year = {1916}, - journal = {Annalen der Physik}, - volume = {354}, - number = {7}, - pages = {769-822}, - doi = {10.1002/andp.19163540702} +# arXiv version: https://doi.org/10.48550/arXiv.1811.05076 +@article{WangLi2020, + title = {Learning from Binary Multiway Data: Probabilistic Tensor Decomposition and its Statistical Optimality}, + author = {Miaoyan Wang and Lexin Li}, + year = {2020}, + journal = {Journal of Machine Learning Research}, + volume = {21}, + number = {154}, + pages = {1--38} } -@article{MultilinearOperators-Kolda2006, - title = {Multilinear operators for higher-order decompositions.}, - author = {Kolda, Tamara Gibson}, - doi = {10.2172/923081}, - url = {https://www.osti.gov/biblio/923081}, - place = {United States}, - year = {2006}, - month = {4}, - type = {Technical Report} -} - -@book{aufbauAnalysis-kaltenbaeck2021, - title = {Aufbau Analysis}, - author = {Kaltenb\"ack, Michael}, - isbn = {978-3-88538-127-3}, - series = {Berliner Studienreihe zur Mathematik}, - edition = {27}, - year = {2021}, - publisher = {Heldermann Verlag} -} - -@article{TensorNormalMLE-ManceurDutilleul2013, - title = {Maximum likelihood estimation for the tensor normal distribution: Algorithm, minimum sample size, and empirical bias and dispersion}, - author = {Ameur M. Manceur and Pierre Dutilleul}, - journal = {Journal of Computational and Applied Mathematics}, - volume = {239}, - pages = {37-49}, - year = {2013}, - issn = {0377-0427}, - doi = {10.1016/j.cam.2012.09.017}, - url = {https://www.sciencedirect.com/science/article/pii/S0377042712003810} -} - -@article{StatIdentTensorGaussian-DeesMandic2019, - title = {A Statistically Identifiable Model for Tensor-Valued Gaussian Random Variables}, - author = {Bruno Scalzo Dees and Danilo P. Mandic}, - journal = {ArXiv}, - year = {2019}, - volume = {abs/1911.02915}, - url = {https://api.semanticscholar.org/CorpusID:207847615} -} - -@article{Ising-Ising1924, - author = {Ising, Ernst}, - title = {{Beitrag zur Theorie des Ferromagnetismus}}, - journal = {Zeitschrift f\"ur Physik}, - pages = {253-258}, - volume = {31}, - number = {1}, - year = {1924}, - month = {2}, - issn = {0044-3328}, - doi = {10.1007/BF02980577} -} - - -# TODO: Fix the following!!! -@book{GraphicalModels-Whittaker2009, +% TODO: check +@book{Whittaker2009, author = {J. 
Whittaker}, title = {Graphical Models in Applied Multivariate Statistics}, publisher = {Wiley}, year = {2009} } -@article{MVB-Dai2012, - author = {B. Dai}, - title = {Multivariate bernoulli distribution models}, - year = {2012} +@article{ZhangZhou2005, + title = {(2D)2PCA: Two-directional two-dimensional PCA for efficient face representation and recognition}, + journal = {Neurocomputing}, + volume = {69}, + number = {1}, + pages = {224-231}, + year = {2005}, + note = {Neural Networks in Signal Processing}, + issn = {0925-2312}, + doi = {https://doi.org/10.1016/j.neucom.2005.06.004}, + url = {https://www.sciencedirect.com/science/article/pii/S0925231205001785}, + author = {Daoqiang Zhang and Zhi-Hua Zhou} } -@article{MVB-DaiDingWahba2013, - author = {B. Dai, S. Ding, and G. Wahba}, - title = {Multivariate bernoulli distribution}, - year = {2013} +@article{ZhouLi2014, + author = {Zhou, Hua and Li, Lexin}, + title = {Regularized matrix regression}, + journal = {Journal of the Royal Statistical Society. Series B (Statistical Methodology)}, + volume = {76}, + number = {2}, + pages = {463--483}, + year = {2014}, + publisher = {[Royal Statistical Society, Wiley]} } -@article{sdr-mixedPredictors-BuraForzaniEtAl2022, - author = {Bura and Forzani and TODO}, - title = {Sufficient reductions in regression with mixed predictors}, - journal = {}, - volume = {}, - number = {}, - year = {2022} -} -@article{sparseIsing-ChengEtAt2014, - author = {J. Cheng, E. Levina, and J. Wang, P.and Zhu}, - title = {A sparse Ising model with covariates}, - journal = {}, - volume = {}, - number = {}, - year = {2014}, - doi = {10.1111/biom.12202} -} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Software, data Sets, ... %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -@book{deeplearningbook-GoodfellowEtAl2016, - title = {Deep Learning}, - author = {Ian Goodfellow and Yoshua Bengio and Aaron Courville}, - publisher = {MIT Press}, - url = {\url{http://www.deeplearningbook.org}}, - year = {2016} -} - -@misc{rmsprop-Hinton2012, - title = {Neural networks for machine learning}, - author = {Hinton, G.}, - year = {2012} -} - -@article{self-kapla2019, - title = {Comparison of Different Word Embeddings and Neural Network Types for Sentiment Analysis of German Political Speeches}, - author = {Kapla, Daniel}, - year = {2019} -} - -@article{MGCCA-GirkaEtAl2024, - title = {Tensor generalized canonical correlation analysis}, - author = {Fabien Girka and Arnaud Gloaguen and Laurent {Le Brusquet} and Violetta Zujovic and Arthur Tenenhaus}, - year = {2024}, - journal = {Information Fusion}, - volume = {102}, - issn = {1566-2535}, - doi = {10.1016/j.inffus.2023.102045} +@misc{lichess-database, + author = {Thibault Duplessis}, + title = {lichess.org open database}, + year = {2013}, + url = {https://database.lichess.org}, + note = {visited on December 8, 2023}, } @Article{Rdimtools, @@ -401,12 +1017,11 @@ abstract = { This survey provides an overview of higher-order tensor decompositi doi = {10.1016/j.simpa.2022.100414}, } -@misc{lichess-database, - author = {{Thibault Duplessis}}, - title = {lichess.org open database}, - year = {2013}, - url = {https://database.lichess.org}, - note = {visited on December 8, 2023}, +@mish{schachhoernchen, + title = {Schach H\"ornchen}, + year = {development since 2021, first release pending}, + author = {Daniel Kapla}, + url = {todo!} } @misc{stockfish, @@ -416,10 +1031,3 @@ abstract = { This survey provides an overview of 
higher-order tensor decompositi
     url = {https://stockfishchess.org/},
     abstract = {Stockfish is a free and strong UCI chess engine.},
 }
-
-@mish{schachhoernchen,
-    title = {Schach H\"ornchen},
-    year = {development since 2021, first release pending},
-    author = {Kapla, Daniel},
-    url = {todo!}
-}
diff --git a/LaTeX/paper.tex b/LaTeX/paper.tex
index fb7a349..feba01e 100644
--- a/LaTeX/paper.tex
+++ b/LaTeX/paper.tex
@@ -20,9 +20,10 @@
     backrefstyle = three+, % summarise pages, e.g. p. 2f, 6ff, 7-10
     date = year, % date format
     % backend = biber,
-    giveninits = true, % abbreviate first name
+    giveninits = false, % abbreviate first name
     clearlang = false, % show language?
-    uniquename = init,
+    uniquename = false,
+    sortcites = false,
    natbib = false,
    dashed = true,
    url = false,
@@ -101,7 +102,7 @@
 % Define math macros
 \renewcommand{\hat}{\widehat} %
 \newcommand*{\ten}[1]{\mathcal{#1}}
-\DeclareMathOperator{\sym}{sym}
+% \DeclareMathOperator{\sym}{sym}
 \renewcommand*{\vec}{\operatorname{vec}}
 \newcommand*{\unvec}{\operatorname{vec^{-1}}}
 \newcommand*{\reshape}[1]{\operatorname{reshape}_{#1}}
@@ -258,28 +259,46 @@
 %------------------------------------------------------------------------------%
 \section{Introduction}
+In statistics, tensors are a mathematical tool to represent data of complex structure. In this paper, \textit{tensors} are considered a generalization of matrices to higher dimensions: a tensor is a multi-dimensional array of numbers. For example, a second-order tensor can be represented as a matrix, while a third-order tensor can be represented as a cube of matrices.
+
+Complex data are often collected at different times and/or under several conditions, involving a large number of multi-indexed variables that are represented as tensor-valued data \parencite{KoldaBader2009}. Such data occur in large-scale longitudinal studies \parencite[e.g.,][]{Hoff2015}, in agricultural experiments, and in chemometrics and spectroscopy \parencite[e.g.,][]{LeurgansRoss1992,Burdick1995}. They also arise in signal and video processing, where sensors produce multi-indexed data, e.g., over spatial, frequency, and temporal dimensions \parencite[e.g.,][]{DeLathauwerCastaing2007,KofidisRegalia2005}, and in telecommunications \parencite{DeAlmeidaEtAt2007}.
+Examples of multiway data include 3D images of the brain, where the modes are the 3 spatial dimensions,
+and spatio-temporal weather imaging data, a set of image sequences represented as 2 spatial modes and 1 temporal mode.
+
+%In fact, it will be shown that scientific research cannot do without multiway analysis, even though not everybody knows this yet.
+
+
+
+% A nice work, particularly focusing on the need to go from matrix-variate to tensor-based MLN distribution, is given in \textcite{BasserPajevic2003}. They genuinely argue why a vectorial treatment of a complex data set which actually needs a tensorial treatment and the application of multilinear normality, can lead to wrong or inefficient conclusions. For some more relevant work in the same direction, see \textcite{BasserPajevic2000,BasserPajevic2007,LeBihanEtAl2001}, and the references cited therein, whereas a Bayesian perspective is given in \textcite{MartinFernandez2004}; see also \textcite{Hoff2011}. Analysis of multilinear, particularly trilinear data, has a specific attraction in chemometrics and spectroscopy; see for example \textcite{LeurgansRoss1992}, \textcite{Burdick1995}. 
Other areas of applications include signal processing \textcite{KofidisRegalia2005}, morphometry \textcite{LeporeEtAl2008}, geostatistics \textcite{LiuKoike2007}, and statistical mechanics \textcite{Soize2008}, to mention a few. The extensive use of tensor variate analysis in these and other similar fields has generated a special tensorial nomenclature, for example diffusion tensor, dyadic tensor, stress and strain tensors etc. \textcite{McCullagh1987}. Similarly, special tensorial decompositions, for example PARAFAC and Tucker decompositions \textcite{Kroonenberg2008}, have been developed; for a general comprehensive review of tensor decompositions and their various applications, see \textcite{Comon2002,KoldaBader2009,ShashuaHazan2005}. + +%From \textcite{SongHero2023}: +%Modern datasets and their associated probabilistic models often exhibit a multiway nature, involving a massive number of multi-indexed variables represented as a tensor \parencite{KoldaBader2009}. This structure poses significant challenges for data scientists due to the large number of variables (exponential with respect to the tensor order). For example, an order-3 tensor of size $d \times d \times d$ involves $d^3$ variables, and representing a real-valued tensor with $d \ge 10^4$ already requires more than 1TB of memory. As data tensors grow in size, storage and processing complexities for the large data arrays increase quadratically in relation to the data size. + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Notation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -We start by introducing the notation we use throughout the paper. Let $\ten{A} = (\ten{A}_{i_1, \ldots, i_r})\in\mathbb{R}^{q_1\times \ldots\times q_r}$ be an order\footnote{Also referred to as rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the rank as in ``the rank of a matrix''.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes (dimensions) of $\ten{A}$ and $\ten{A}_{i_1,...,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ has two modes, the rows and columns. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$, $k\in[r] = \{1, 2, \ldots, r\}$, the \emph{multi-linear multiplication}, or \emph{Tucker Operator} \parencite{MultilinearOperators-Kolda2006}, is defined element wise as +%We introduce the notation throughout the paper. +$\ten{A} = (\ten{A}_{i_1, \ldots, i_r})\in\mathbb{R}^{q_1\times \ldots\times q_r}$ denotes an order\footnote{Also referred to as rank, therefore the variable name $r$, but this term is \emph{not} used as it leads to confusion with the concept of rank of a matrix.} $r$ tensor, where $r\in\mathbb{N}$ is the number of modes or axes (dimensions) of $\ten{A}$ and $\ten{A}_{i_1,...,i_r} \in \mathbb{R}$ is its $(i_1, \ldots, i_r)$th entry. For example, a $p \times q$ matrix $\mat{B}$ has two modes, the rows and columns. For matrices $\mat{B}_k\in\mathbb{R}^{p_k\times q_k}$, $k\in[r] = \{1, 2, \ldots, r\}$, the \emph{multi-linear multiplication}, or \emph{Tucker operator} \parencite{Kolda2006}, is defined element wise as \begin{displaymath} (\ten{A}\times\{\mat{B}_1, \ldots, \mat{B}_r\})_{j_1, \ldots, j_r} = \sum_{i_1, \ldots, i_r = 1}^{q_1, \ldots, q_r} \ten{A}_{i_1, \ldots, i_r}(\mat{B}_{1})_{j_1, i_1} \cdots (\mat{B}_{r})_{j_r, i_r} \end{displaymath} -which results in an order $r$ tensor of dimension $p_1\times ...\times p_k$. 
This results in the \emph{$k$-mode product} between the tensor $\ten{A}$ with the matrix $\mat{B}_k$,
+which results in an order $r$ tensor of dimension $p_1\times ...\times p_r$. The \emph{$k$-mode product} of the tensor $\ten{A}$ with the matrix $\mat{B}_k$ is
 \begin{displaymath}
     \ten{A}\times_k\mat{B}_k = \ten{A}\times\{\mat{I}_{q_1}, \ldots, \mat{I}_{q_{k-1}}, \mat{B}_{k}, \mat{I}_{q_{k+1}}, \ldots, \mat{I}_{q_r}\}.
 \end{displaymath}
-Furthermore, the notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is short hand for the iterative application of the mode product for all indices in $S\subseteq[r]$. For example $\ten{A}\times_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous, because the mode product commutes for different modes: $j\neq k\Rightarrow\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$.
+The notation $\ten{A}\mlm_{k\in S}\mat{B}_k$ is shorthand for the iterative application of the mode product for all indices in $S\subseteq[r]$. For example, $\ten{A}\mlm_{k\in\{2, 5\}}\mat{B}_k = \ten{A}\times_2\mat{B}_2\times_5\mat{B}_5$. By only allowing $S$ to be a set, this notation is unambiguous because the mode product commutes for different modes; i.e., $\ten{A}\times_j\mat{B}_j\times_k\mat{B}_k = \ten{A}\times_k\mat{B}_k\times_j\mat{B}_j$ for $j\neq k$.
 %Matrices and tensors can be \emph{vectorized} by the \emph{vectorization} operator $\vec$.
-The operator $\vec$ maps an array to a vector. For example, $\vec(\mat{B})$ stands for the $pq \times 1$ vector of the $p \times q$ matrix $\mat{B}$ resulting from stacking the columns of $\mat{B}$ one after the other. For a tensor $\ten{A}= (a_{i_1,\ldots,i_r})$ of order $r$ and dimensions $q_1, \ldots, q_r$, $\vec(\ten{A})$ is the $q_1 q_2 \ldots q_r \times 1$ vector with the elements of $\ten{A}$ stacked one after the other in the specified order $r$ then $r-1$, and so on. For example, if $\ten{A}$ is 3-dimensional array, $\vec(\ten{A})=(\vec(\ten{A}(:,:,1))^T,\vec(\ten{A}(:,:,2)^T,\ldots,\vec(\ten{A}(:,:,q_r)^T)^T$. We use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec(\ten{A}) = \vec(\ten{B})$.
+The operator $\vec$ maps an array to a vector. Specifically, $\vec(\mat{B})$ stands for the $pq \times 1$ vector of the $p \times q$ matrix $\mat{B}$ resulting from stacking the columns of $\mat{B}$ one after the other. For a tensor $\ten{A}= (\ten{A}_{i_1,\ldots,i_r})$ of order $r$ and dimensions $q_1, \ldots, q_r$, $\vec(\ten{A})$ is the $q_1 q_2 \ldots q_r \times 1$ vector with the elements of $\ten{A}$ stacked one after the other in the order $r$ then $r-1$, and so on. For example, if $\ten{A}$ is a 3-dimensional array, $\vec(\ten{A})=(\vec(\ten{A}(:,:,1))^T,\vec(\ten{A}(:,:,2))^T,\ldots,\vec(\ten{A}(:,:,q_r))^T)^T$. We use the notation $\ten{A}\equiv \ten{B}$ for objects $\ten{A}, \ten{B}$ of any shape if and only if $\vec(\ten{A}) = \vec(\ten{B})$.
 The \emph{inner product} between two tensors of the same order and dimensions is
 \begin{displaymath}
     \langle\ten{A}, \ten{B}\rangle = \sum_{i_1, \ldots, i_r} \ten{A}_{i_1, \ldots, i_r}\ten{B}_{i_1, \ldots, i_r}
 \end{displaymath}
-that leads to the definition of the \emph{Frobenius Norm} for tensors, $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$ and is the straightforward extension of the Frobenius norm for matrices and vectors. 
%are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$. -The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$ such that +This leads to the definition of the \emph{Frobenius norm} for tensors, $\|\ten{A}\|_F = \sqrt{\langle\ten{A}, \ten{A}\rangle}$ and is the straightforward extension of the Frobenius norm for matrices and vectors. %are also used for matrices while for a vector $\mat{a}$ the \emph{2 norm} is $\|\mat{a}\|_2 = \sqrt{\langle\mat{a}, \mat{a}\rangle}$. +The \emph{outer product} between two tensors $\ten{A}$ of dimensions $q_1, \ldots, q_r$ and $\ten{B}$ of dimensions $p_1, \ldots, p_l$ is a tensor $\ten{A}\circ\ten{B}$ of order $r + l$ and dimensions $q_1, \ldots, q_r, p_1, \ldots, p_l$, such that \begin{displaymath} \ten{A}\circ\ten{B} \equiv (\vec\ten{A})\t{(\vec{\ten{B}})}. \end{displaymath} @@ -287,185 +306,74 @@ Let $\K : \mathbb{R}^{q_1, ..., q_{2 r}}\to\mathbb{R}^{q_1 q_{r + 1}, ..., q_r q \begin{align*} \K(\ten{A})_{i_1 + 1, ..., i_r + 1} &= \ten{A}_{\lfloor i_1 / q_{r + 1}\rfloor + 1, ..., \lfloor i_r / q_{2 r} \rfloor + 1, (i_1\operatorname{mod}q_{r + 1}) + 1, ..., (i_r\operatorname{mod}q_{2 r}) + 1} \end{align*} -where $\lfloor\,.\,\rfloor$ is the floor operation and $a\operatorname{mod}b$ is the integer divition remainder of $a / b$. The mapping $\K$ is a linear operation and maps an order $2 r$ tensor to an order $r$ tensor by reshaping and permuting its elements. This operation allows to define a generalization of the \emph{Kronecker product} which we define as $\ten{A}\otimes\ten{B} = \K(\ten{A}\circ\ten{B})$. +where $\lfloor\,.\,\rfloor$ is the floor operation and $a\operatorname{mod}b$ is the integer division remainder of $a / b$. The mapping $\K$ is a linear operation and maps an order $2 r$ tensor to an order $r$ tensor by reshaping and permuting its elements. This operation allows defining a generalization of the \emph{Kronecker product} to tensors, which we define as $\ten{A}\otimes\ten{B} = \K(\ten{A}\circ\ten{B})$. For tensors of order at least $2$, the \emph{flattening} (or \emph{unfolding} or \emph{matricization}) is a reshaping of the tensor into a matrix along a particular mode. For a tensor $\ten{A}$ of order $r$ and dimensions $q_1, \ldots, q_r$, the $k$-mode unfolding $\ten{A}_{(k)}$ is a $q_k\times \prod_{l=1, l\neq k}q_l$ matrix with %For the tensor $\ten{A} = (a_{i_1,\ldots,i_r})\in\mathbb{R}^{q_1, \ldots, q_r}$ the elements %of the $k$ unfolded tensor $\ten{A}_{(k)}$ are \begin{displaymath} (\ten{A}_{(k)})_{i_k, j} = \ten{A}_{i_1, \ldots, i_r}\quad\text{ with }\quad j = 1 + \sum_{\substack{l = 1\\l \neq k}}^r (i_l - 1) \prod_{\substack{m = 1\\m\neq k}}^{l - 1}q_m. \end{displaymath} +Illustration examples of vectorization and matricization are given in Appendix \ref{app:examples}. % The rank of a tensor $\ten{A}$ of dimensions $q_1\times ...\times q_r$ is vector-valued; that is, $\rank(\ten{A}) = (a_1, \ldots, a_r)\in[q_1]\times...\times[q_r]$, where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor. 
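+To illustrate the definition of $\otimes$ above, consider two matrices (order-$2$ tensors) $\mat{A}\in\mathbb{R}^{q_1\times q_2}$ and $\mat{B}\in\mathbb{R}^{p_1\times p_2}$. Then $\K$ maps the order-$4$ tensor $\mat{A}\circ\mat{B}$ to the $q_1 p_1\times q_2 p_2$ matrix with entries $(\mat{A}\otimes\mat{B})_{(j_1 - 1)p_1 + k_1, (j_2 - 1)p_2 + k_2} = \mat{A}_{j_1, j_2}\mat{B}_{k_1, k_2}$, that is, $\otimes$ coincides with the usual Kronecker product of matrices. For example, with two arbitrarily chosen $2\times 2$ matrices,
+\begin{displaymath}
+    \begin{pmatrix}
+        1 & 2 \\
+        3 & 4
+    \end{pmatrix}
+    \otimes
+    \begin{pmatrix}
+        0 & 1 \\
+        1 & 0
+    \end{pmatrix}
+    =
+    \begin{pmatrix}
+        0 & 1 & 0 & 2 \\
+        1 & 0 & 2 & 0 \\
+        0 & 3 & 0 & 4 \\
+        3 & 0 & 4 & 0
+    \end{pmatrix}.
+\end{displaymath}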
-The gradient of a function $\ten{F}(\ten{X})$ of any shape, univariate, multivariate or tensor valued, with argument $\ten{X}$ of any shape is defined as +The gradient of a function $\ten{F}(\ten{X})$ of any shape, univariate, multivariate or tensor valued, with argument $\ten{X}$ of any shape is defined as the $p\times q$ matrix \begin{displaymath} - \nabla_{\ten{X}}\ten{F} = \frac{\partial\t{(\vec\ten{F}(\ten{X}))}}{\partial(\vec\ten{X})} + \nabla_{\ten{X}}\ten{F} = \frac{\partial\t{(\vec\ten{F}(\ten{X}))}}{\partial(\vec\ten{X})}, \end{displaymath} -which is a matrix of dimension $p\times q$ where the vectorized quantities $\vec{\ten{X}}\in\mathbb{R}^p$ and $\vec\ten{F}(\ten{X})\in\mathbb{R}^q$. This is consistent with the gradient of a real-valued function $f(\mat{x})$ where $\mat{x}\in\mathbb{R}^p$ as $\nabla_{\mat{x}}f\in\mathbb{R}^{p\times 1}$. \todo{Maybe reference magnus and abadir, magnus and neudecker?!} +where the vectorized quantities $\vec{\ten{X}}\in\mathbb{R}^p$ and $\vec\ten{F}(\ten{X})\in\mathbb{R}^q$. This is consistent with the gradient of a real-valued function $f(\mat{x})$ where $\mat{x}\in\mathbb{R}^p$ as $\nabla_{\mat{x}}f\in\mathbb{R}^{p\times 1}$ \parencite[ch. 15]{Harville1997}. +%\todo{Maybe reference magnus and abadir, magnus and neudecker?!} +%\todo{I think the following examples are a good idea for the appendix.} -\todo{$\vech\ten{A}$ the half vectorization! Define via the vector containing the tensor elements with indices in \emph{reflected lexicographic order}? Also don't forget to figure out how to (if even) to define the half vectorization of a tensor (with all modes of the same dimensions)} -\todo{$\sym{\ten{A}}$ tensor needed?!} - -\todo{I think the following examples are a good idea for the appendix.} - -\begin{example}[Vectorization]\label{ex:vectorization} - Given a matrix - \begin{displaymath} - \mat{A} = \begin{pmatrix} - 1 & 4 & 7 \\ - 2 & 5 & 8 \\ - 3 & 6 & 9 - \end{pmatrix} - \end{displaymath} - its vectorization is $\vec{\mat{A}} = \t{(1, 2, 3, 4, 5, 6, 7, 8, 9)}$ and its half vectorization $\vech{\mat{A}} = \t{(1, 2, 3, 5, 6, 9)}$. Let a $\ten{A}$ be a tensor of dimension $3\times 3\times 3$ given by - \begin{displaymath} - \ten{A}_{:,:,1} = \begin{pmatrix} - 1 & 4 & 7 \\ - 2 & 5 & 8 \\ - 3 & 6 & 9 - \end{pmatrix}, - \qquad - \ten{A}_{:,:,2} = \begin{pmatrix} - 10 & 13 & 16 \\ - 11 & 14 & 17 \\ - 12 & 15 & 18 - \end{pmatrix}, - \qquad - \ten{A}_{:,:,3} = \begin{pmatrix} - 19 & 22 & 25 \\ - 20 & 23 & 26 \\ - 21 & 24 & 27 - \end{pmatrix}. - \end{displaymath} - Then the vectorization of $\ten{A}$ is given by - \begin{displaymath} - \vec{\ten{A}} = \t{(1, 2, 3, 4, ..., 26, 27)}\in\mathbb{R}^{27} - \end{displaymath} - while the half vectorization is - \begin{displaymath} - \vech{\ten{A}} = \t{(1, 2, 3, 5, 6, 9, 11, 12, 15, 21)}\in\mathbb{R}^{10}. - \end{displaymath} -\end{example} - -\begin{example}[Matricization] - Let $\ten{A}$ be the $3\times 4\times 2$ tensor given by - \begin{displaymath} - \ten{A}_{:,:,1} = \begin{pmatrix} - 1 & 4 & 7 & 10 \\ - 2 & 5 & 8 & 11 \\ - 3 & 6 & 9 & 12 - \end{pmatrix}, - \ten{A}_{:,:,2} = \begin{pmatrix} - 13 & 16 & 19 & 22 \\ - 14 & 17 & 20 & 23 \\ - 15 & 18 & 21 & 24 - \end{pmatrix}. 
- \end{displaymath} - Its matricizations are then - \begin{gather*} - \ten{A}_{(1)} = \begin{pmatrix} - 1 & 4 & 7 & 10 & 13 & 16 & 19 & 22 \\ - 2 & 5 & 8 & 11 & 14 & 17 & 20 & 23 \\ - 3 & 6 & 9 & 12 & 15 & 18 & 21 & 24 - \end{pmatrix}, - \qquad - \ten{A}_{(2)} = \begin{pmatrix} - 1 & 2 & 3 & 13 & 14 & 15 \\ - 4 & 5 & 6 & 16 & 17 & 18 \\ - 7 & 8 & 9 & 19 & 20 & 21 \\ - 10 & 11 & 12 & 22 & 23 & 24 - \end{pmatrix}, \\ - \ten{A}_{(3)} = \begin{pmatrix} - 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 \\ - 13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24 - \end{pmatrix}. - \end{gather*} -\end{example} - -% \begin{example}[Symmetrization] -% Let $\ten{A}$ be the $3\times 3\times 3$ tensor from \cref{ex:vectorization}, then the symmetrization of $\ten{A}$ is -% \begin{align*} -% (\sym{\ten{A}})_{:,:,1} &= \frac{1}{6}\begin{pmatrix} -% 6 & 32 & 58 \\ -% 32 & 58 & 84 \\ -% 58 & 84 & 110 -% \end{pmatrix}, \\ -% (\sym{\ten{A}})_{:,:,2} &= \frac{1}{6}\begin{pmatrix} -% 32 & 58 & 84 \\ -% 58 & 84 & 110 \\ -% 84 & 110 & 136 -% \end{pmatrix}, \\ -% (\sym{\ten{A}})_{:,:,3} &= \frac{1}{6}\begin{pmatrix} -% 58 & 84 & 110 \\ -% 84 & 110 & 136 \\ -% 110 & 136 & 162 -% \end{pmatrix}. -% \end{align*} -% \end{example} - -% \begin{example}[Half Vectorization] -% The half vectorization of a square matrix -% \begin{displaymath} -% \mat{A} = \begin{pmatrix} -% 1 & 4 & 7 \\ -% 2 & 5 & 8 \\ -% 3 & 6 & 9 -% \end{pmatrix} -% \end{displaymath} -% is -% \begin{displaymath} -% \vech{\mat{A}} = (1, 2, 3, 5, 6, 9). -% \end{displaymath} -% \end{example} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Problem Formulation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -Our goal is to identify the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times ... \times p_r$ with continuous or discrete entries and there is no constraint in the form of $Y$. The predictor $\ten{X}$ is a complex object; to simplify the problem we assume their exists a function $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that $\ten{R}(\ten{X})$ is tensor valued of lower dimension so that +Our goal is to infer the cumulative distribution function (cdf) $F$ of $Y\mid \ten{X}$, where $\ten{X}$ is assumed to admit $r$-tensor structure of dimension $p_1\times ... \times p_r$ with continuous or discrete entries and the response $Y$ is unconstrained. The predictor $\ten{X}$ is a complex object; to simplify the problem we assume there exists a tensor valued function of lower dimension $\ten{R}:\ten{X}\mapsto \ten{R}(\ten{X})$ such that \begin{displaymath} F(Y\mid \ten{X}) = F(Y\mid \ten{R}(\ten{X})). \end{displaymath} Since $\ten{R}(\ten{X})$ replaces the predictors without any effect in the conditional cdf of $Y\mid \ten{X}$, it is a \emph{sufficient reduction} for the regression $Y\mid\ten{X}$. We assume the tensor valued $\ten{R}(\ten{X})$ has dimension $q_1\times...\times q_r$ with $q_j\leq p_j$, $j = 1, ..., r$, which represents a dimension reduction along every mode of $\ten{X}$. This formulation is flexible as it allows, for example, to select ``important'' modes by reducing ``unimportant'' modes to be $1$ dimensional. 
-To find such a reduction $\ten{R}$, we leverage the equivalence pointed out in \textcite{FisherLectures-Cook2007}, +To find such a reduction $\ten{R}$, we leverage the equivalence relation pointed out in \textcite{Cook2007}, \begin{equation}\label{eq:inverse-regression-sdr} Y\mid\ten{X} \sim Y\mid \ten{R}(\ten{X}) \quad\Longleftrightarrow\quad - \ten{X}\mid(Y, \ten{R}(\ten{X})) \sim \ten{X}\mid\ten{R}(\ten{X}), + \ten{X}\mid(Y, \ten{R}(\ten{X})) \sim \ten{X}\mid\ten{R}(\ten{X}). \end{equation} -which means that a \textit{sufficient statistic} $\ten{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$, where $Y$ is considered as a parameter indexing the model, is also a sufficient reduction for $\ten{X}$ in the forward regression $Y\mid\ten{X}$. The equivalent inverse regression in \eqref{eq:inverse-regression-sdr} provides exhaustive characterization of $\ten{R}(\ten{X})$. +According to \eqref{eq:inverse-regression-sdr}, a \textit{sufficient statistic} $\ten{R}(\ten{X})$ for $Y$ in the inverse regression $\ten{X}\mid Y$, where $Y$ is considered as a parameter indexing the model, is also a \textit{sufficient reduction} for $\ten{X}$ in the forward regression $Y\mid\ten{X}$. %The equivalent inverse regression in \eqref{eq:inverse-regression-sdr} provides exhaustive characterization of $\ten{R}(\ten{X})$. -The usual tool to identify sufficient statistics is the factorization theorem that requires a distributional model. Here we assume the distribution of $\ten{X}\mid Y$ belongs to the \emph{quadratic exponential family} in order to (a) simplify modeling and (b) keep estimation feasible. An important feature of the \emph{quadratic exponential family} is that its members are characterized by their first two moments. Specifically, we assume that $\ten{X}\mid Y$ is a full rank quadratic exponential family with density +The factorization theorem is the usual tool to identify sufficient statistics and requires a distributional model. In this paper, we assume the distribution of $\ten{X}\mid Y$ belongs to the \emph{quadratic exponential family} in order to (a) simplify modeling and (b) keep estimation feasible. We assume that $\ten{X}\mid Y$ is a full rank quadratic exponential family with density \begin{align} f_{\mat{\eta}_y}(\ten{X}\mid Y = y) &= h(\ten{X})\exp(\t{\mat{\eta}_y}\mat{t}(\ten{X}) - b(\mat{\eta}_y)) \nonumber \\ &= h(\ten{X})\exp(\langle \mat{t}_1(\ten{X}), \mat{\eta}_{1y} \rangle + \langle \mat{t}_2(\ten{X}), \mat{\eta}_{2y} \rangle - b(\mat{\eta}_{y})) \label{eq:quad-density} \end{align} -where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued. For $b$ we assume it is at least twice continuously differentiable and structly convex. -Distributions within the quadratic exponential family include the \emph{tensor normal} \todo{cite, if can be found} and \emph{tensor Ising model} \todo{cite} (a generalization of the (inverse) Ising model which is multi-variate Bernoulli with up to second order interactions) and mixtures of these two. +where $\mat{t}_1(\ten{X})=\vec \ten{X}$ and $\mat{t}_2(\ten{X})$ is linear in $\ten{X}\circ\ten{X}$. The dependence of $\ten{X}$ on $Y$ is fully captured in the natural parameter $\mat{\eta}_y$. The function $h$ is non-negative real-valued and $b$ is assumed to be at least twice continuously differentiable and strictly convex. 
An important feature of the \emph{quadratic exponential family} is that the distribution of its members is fully characterized by their first two moments. Distributions within the quadratic exponential family include the \emph{tensor normal} and \emph{tensor Ising model} \todo{cite} (a generalization of the (inverse) Ising model, which is a multivariate Bernoulli with up to second order interactions) and mixtures of these two. + +The tensor normal distribution, formulated via the Tucker operator convention, has appeared in several papers \parencite{ManceurDutilleul2013,Hoff2011,OhlsonEtAl2013}; this representation of tensors appeared in the doctoral thesis of Dutilleul. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{The Generalized Multi-Linear Model} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In model \eqref{eq:quad-density}, the relationship of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}-parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, thus it is not a parameter in the formal statistical sense. It is considered as a parameter when using the equivalence in \eqref{eq:inverse-regression-sdr} and view $Y$ as a parameter as a device to derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with +In model \eqref{eq:quad-density}, the dependence of $\ten{X}$ and $Y$ is absorbed in $\mat{\eta}_y$, and $\mat{t}(\ten{X})$ is the minimal sufficient statistic for the \textit{pseudo}-parameter\footnote{$\mat{\eta}_y$ is a function of the response $Y$, thus it is not a parameter in the formal statistical sense. It is treated as a parameter when using the equivalence in \eqref{eq:inverse-regression-sdr}, viewing $Y$ as a parameter, as a device to derive the sufficient reduction from the inverse regression.} $\mat{\eta}_y = (\mat{\eta}_{1y}, \mat{\eta}_{2y})$ with \begin{align}\label{eq:t-stat} \mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}),\mat{t}_2(\ten{X}))=(\vec{\ten{X}}, \mat{T}_2\vech((\vec\ten{X})\t{(\vec\ten{X})})), \end{align} -where the $d\times p(p + 1) / 2$ dimensional matrix $\mat{T}_2$ with $p = \prod_{i = 1}^r p_i$ ensures that $\mat{\eta}_{2y}$ is of minimal dimension $d$. The matrix $\mat{T}_2$ is of full rank $d$ and is unique to specific members of the quadratic exponential family. - +where the $d\times p(p + 1) / 2$ dimensional matrix $\mat{T}_2$ with $p = \prod_{i = 1}^r p_i$ ensures that $\mat{\eta}_{2y}$ is of minimal dimension $d$. The matrix $\mat{T}_2$ is of full rank $d$ and unique to different members of the quadratic exponential family. We can reexpress the exponent in \eqref{eq:quad-density} as \begin{align*} \t{\mat{\eta}_y} \mat{t}(\ten{X}) &= \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \mat{T}_2\vech(\ten{X}\circ\ten{X}), \mat{\eta}_{2y} \rangle = \langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle \end{align*} -where $\mat{D}_p$ is the \emph{duplication matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, defined so that $\mat{D}_p\vech \mat{A} = \vec \mat{A}$ for every symmetric $p\times p$ matrix $\mat{A}$, and $\pinv{\mat{D}_p}$ is its Moore-Penrose pseudo inverse. The first natural parameter component, $\mat{\eta}_{1y}$, captures the first order, and $\mat{\eta}_{2y}$, the second order relationship of $Y$ and $\ten{X}$.
The quadratic exponential density of $\ten{X} \mid Y$ can then be expressed as +where $\mat{D}_p$ is the \emph{duplication matrix} from \textcite[Ch.~11]{AbadirMagnus2005}, defined so that $\mat{D}_p\vech \mat{A} = \vec \mat{A}$ for every symmetric $p\times p$ matrix $\mat{A}$, and $\pinv{\mat{D}_p}$ is its Moore-Penrose pseudo inverse. The first natural parameter component, $\mat{\eta}_{1y}$, captures the first order, and $\mat{\eta}_{2y}$, the second order relationship of $Y$ and $\ten{X}$. The quadratic exponential density of $\ten{X} \mid Y$ can then be expressed as \begin{equation}\label{eq:quadratic-exp-fam} - f_{\eta_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp(\langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle - b(\mat{\eta}_y)) + f_{\eta_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp\left(\langle \vec \ten{X}, \mat{\eta}_{1y} \rangle + \langle \vec(\ten{X}\circ\ten{X}), \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y} \rangle - b(\mat{\eta}_y)\right) \end{equation} -The exponential family in \eqref{eq:quadratic-exp-fam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate. This is also the reason why we opted for the second order exponential family in our formulation. +The exponential family in \eqref{eq:quadratic-exp-fam} is easily generalizable to any order. This, though, would result in the number of parameters becoming prohibitive to estimate, which is also the reason why we opted for the second order exponential family in our formulation. By the equivalence in \eqref{eq:inverse-regression-sdr}, in order to find the sufficient reduction $\ten{R}(\ten{X})$ we need to infer $\mat{\eta}_{1y}$, and $\mat{\eta}_{2y}$. This is reminiscent of generalized linear modeling, which we extend to a multi-linear formulation next. Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F}_Y = 0$. We assume the dependence of $\ten{X}$ and $Y$ is reflected only in the first parameter and let @@ -473,17 +381,20 @@ Suppose $\ten{F}_y$ is a known mapping of $y$ with zero expectation $\E_Y\ten{F} \mat{\eta}_{1y} &= \vec{\overline{\ten{\eta}}} + \mat{B}\vec\ten{F}_y, \label{eq:eta1-manifold} \\ \mat{\eta}_{2} &= \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}\vec(c\,\mat{\Omega}), \label{eq:eta2-manifold} \end{align} -where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable in the inverse regression setting. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. 
Thus, \eqref{eq:eta1-manifold} becomes +where $\overline{\ten{\eta}}\in\mathbb{R}^{p_1\times\ldots\times p_r}$, $\mat{\Omega} \in \mathbb{R}^{p \times p}$ is positive definite with $p = \prod_{j = 1}^{r} p_j$, and $c\in\mathbb{R}$ is a known constant determined by the distribution to ease modeling. That is, we assume that only $\mat{\eta}_{1y}$ depends on $Y$ through $\mat{B}$. The second parameter $\mat{\eta}_2$ captures the second order interaction structure of $\ten{X}$, which we assume not to depend on the response $Y$. In order to relate individual modes of $\ten{X}$ to the response, allowing flexibility in modeling, we assume $\ten{F}_y$ takes values in $\mathbb{R}^{q_1\times ...\times q_r}$; that is, $\ten{F}_y$ is a tensor valued independent variable. This, in turn, leads to imposing corresponding tensor structure to the regression parameter $\mat{B}$. Thus, \eqref{eq:eta1-manifold} becomes \begin{align} \mat{\eta}_{1y} &= \vec\biggl(\overline{\ten{\eta}} + \ten{F}_y\mlm_{j = 1}^{r}\mat{\beta}_j\biggr), \label{eq:eta1} \end{align} -where $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and the component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ are of known rank for $j = 1, \ldots, r$. Given the high potential value of $p$, we further assume that +where $\mat{B} = \bigotimes_{j = r}^{1}\mat{\beta}_j$ and the component matrices $\mat{\beta}_j\in\mathbb{R}^{p_j\times q_j}$ are of known rank for $j = 1, \ldots, r$. + +As the bilinear form of the matrix normal requires its covariance be separable, the multilinear structure of $\ten{X}$ also induces separability on its covariance structure (see, e.g., \textcite{Hoff2011}). Therefore, we further assume that \begin{align} -\t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y}= \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2} &= \vec\biggl(c\bigotimes_{j = r}^{1}\mat{\Omega}_j\biggr). \label{eq:eta2} +\t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2y}= \t{(\mat{T}_2\pinv{\mat{D}_p})}\mat{\eta}_{2} &= \vec\biggl(c\bigotimes_{j = r}^{1}\mat{\Omega}_j\biggr), \label{eq:eta2} \end{align} -where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric positive definite matrices for $j = 1, \ldots, r$. That is, we require $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$, which substantially reduces the number of parameters to estimate in $\mat{\Omega}$. The assumption that the $\mat{\Omega}_j$'s be positive definite is possible due to the constant $c$. -Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p})}$ has full column rank $d < p^2$ (with a non-strict inequality if $\ten{X}$ is univariate) but $\mat{\eta}_2$ is uniquely determined given any $\mat{\Omega}$ as $\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}$ has full row rank. We let $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be a $p(p + 2 q + 3) / 2$-parameter vector and $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the constrained parameter vector, where $\mat{B}=\bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$. We also let $\Xi$ and $\Theta$ denote the unconstrained and constrained parameter spaces, with $\mat{\xi}$ and $\mat{\theta}$ varying in $\Xi$ and $\Theta$, respectively. The parameter space $\Xi$ is an open subset of $\mathbb{R}^{p(p + 2 q + 3) / 2}$ so that \eqref{eq:quadratic-exp-fam} is a proper density. 
Later, we relax the assumptions for $\mat{\beta}_k$ and $\mat{\Omega}_k$ as a consequence of \cref{thm:param-manifold} in \cref{sec:kron-manifolds}. +where $\mat{\Omega}_j\in\mathbb{R}^{p_j\times p_j}$ are symmetric positive definite matrices for $j = 1, \ldots, r$. Requiring $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$ substantially reduces the number of parameters to estimate in $\mat{\Omega}$. The assumption that the $\mat{\Omega}_j$'s be positive definite is possible due to the constant $c$. + +Equation \eqref{eq:eta2} is underdetermined since $\t{(\mat{T}_2\pinv{\mat{D}_p})}$ has full column rank $d < p^2$ (with a non-strict inequality if $\ten{X}$ is univariate) but $\mat{\eta}_2$ is uniquely determined given any $\mat{\Omega}$ as $\t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})})}$ has full row rank. We let $\mat{\xi} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the unconstrained $p(p + 2 q + 3) / 2$-parameter vector and $\mat{\theta} = (\vec{\overline{\ten{\eta}}}, \vec{\mat{B}}, \vech{\mat{\Omega}})$ be the constrained parameter vector, where $\mat{B}=\bigotimes_{j = r}^{1}\mat{\beta}_j$ and $\mat{\Omega} = \bigotimes_{j = r}^{1}\mat{\Omega}_j$. We also let $\Xi$ and $\Theta$ denote the unconstrained and constrained parameter spaces, with $\mat{\xi}$ and $\mat{\theta}$ varying in $\Xi$ and $\Theta$, respectively. The parameter space $\Xi$ is an open subset of $\mathbb{R}^{p(p + 2 q + 3) / 2}$ so that \eqref{eq:quadratic-exp-fam} is a proper density. We relax the assumptions for $\mat{\beta}_k$ and $\mat{\Omega}_k$ later as a consequence of \cref{thm:param-manifold} in \cref{sec:kron-manifolds}. % \todo{Maybe already here introduce the ``constraint'' set of $\Omega$'s allowed as $\{ \Omega\in\mathbb{R}_{++}^{p\times p} : \vec{\Omega} = \t{(\pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p})}\vec{\Omega} \}$} @@ -493,7 +404,6 @@ In a classical \emph{generalized linear model} (GLM), the link function connecti \ten{g}_2(\mat{\eta}_y) &= \E[\ten{X}\circ\ten{X} \mid Y = y] \label{eq:inv-link2} \end{align} as $\widetilde{\mat{g}}(\mat{\eta}_y) = (\vec\ten{g}_1(\mat{\eta}_y), \vec\ten{g}_2(\mat{\eta}_y))$. - Under the quadratic exponential family model \eqref{eq:quadratic-exp-fam}, a sufficient reduction for the regression of $Y$ on $\ten{X}$ is given in \cref{thm:sdr}. \begin{theorem}[SDR]\label{thm:sdr} A sufficient reduction for the regression $Y\mid \ten{X}$ under the quadratic exponential family inverse regression model \eqref{eq:quadratic-exp-fam} with natural parameters \eqref{eq:eta1} and \eqref{eq:eta2} is given by @@ -574,15 +484,16 @@ Although the general case of any GMLM model can be fitted via gradient descent u %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Tensor Normal}\label{sec:tensor_normal_estimation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + Suppose $\ten{X}\mid Y = y$ follows a tensor normal distribution with mean $\ten{\mu}_y$ and covariance $\mat{\Sigma} = \bigkron_{k = r}^{1}\mat{\Sigma}_k$. We assume the distribution is non-degenerate which means that the covariances $\mat{\Sigma}_k$ are symmetric positive definite matrices. Its density is given by \begin{displaymath} f_{\mat{\theta}}(\ten{X}\mid Y = y) = (2\pi)^{-p / 2}\prod_{k = 1}^{r}\det(\mat{\Sigma}_k)^{-p / 2 p_k}\exp\left( -\frac{1}{2}\left\langle\ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\mlm_{k = 1}^{r}\mat{\Sigma}_k^{-1} \right\rangle \right). 
\end{displaymath} For the sake of simplicity and w.l.o.g., we assume $\ten{X}$ has 0 marginal expectation; i.e., $\E\ten{X} = 0$. Rewriting this in the quadratic exponential family form \eqref{eq:quadratic-exp-fam} determines the scaling constant $c = -1/2$. The relation to the GMLM parameters $\overline{\ten{\eta}}, \mat{\beta}_k$ and $\mat{\Omega}_k$, for $k = 1, \ldots, r$, is -\begin{displaymath} \ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}, -\end{displaymath} +\begin{equation}\label{eq:tnormal_cond_params} \ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k, \qquad \mat{\Omega}_k = \mat{\Sigma}_k^{-1}, +\end{equation} where we used that $\overline{\ten{\eta}} = 0$ due to $0 = \E\ten{X} = \E\E[\ten{X}\mid Y] = \E\ten{\mu}_Y$ in combination with $\E\ten{F}_Y = 0$. Additionally, all the $\mat{\Omega}_k$'s are symmetric positive definite, because the $\mat{\Sigma}_k$'s are. This leads to a further simplification since then $\mat{T}_2$ in \eqref{eq:t-stat} equals the identity. This also means that the gradients of the log-likelihood $l_n$ in \cref{thm:grad} are simpler. We obtain \begin{displaymath} \ten{g}_1(\mat{\eta}_y) = \E[\ten{X}\mid Y = y] = \ten{\mu}_y, \qquad @@ -644,31 +555,63 @@ A technical detail for numerical stability is to ensure that the scaled values $ Furthermore, if the parameter space follows a more general setting as in \cref{thm:param-manifold}, updating may produce estimates outside the parameter space. A simple and efficient method is to project every updated estimate onto the corresponding manifold. +A standard method to compute the MLE of a Kronecker product structured covariance is block-coordinate descent, also +referred to as the ``flip-flop algorithm.'' This algorithm was proposed independently by \textcite{MardiaGoodall1993} and \textcite{Dutilleul1999} and was later called the ``flip-flop'' algorithm by \textcite{LuZimmerman2005} for the computation of the maximum likelihood estimators of the components of a separable covariance matrix. \textcite{ManceurDutilleul2013} extended the ``flip-flop'' algorithm for the computation of the MLE of the separable covariance structure of a 3-way and 4-way normal distribution and obtained a lower bound for the sample size required for its existence. The same issue was also studied by \textcite{DrtonEtAl2020} in the case of a two-way array (matrix). + +Our estimation procedure for the tensor normal GMLM is essentially of the same block-coordinate type, with the difference that we update the precision matrices $\mat{\Omega}_k = \mat{\Sigma}_k^{-1}$ rather than the covariances $\mat{\Sigma}_k$; a schematic two-mode sketch of the classical flip-flop updates is given below. + + + +\subsubsection{Matrix and Tensor Normal} + +The tensor normal model is the extension of the matrix normal to tensor-valued random variables and a member of the quadratic exponential family \eqref{eq:quadratic-exp-fam} under \eqref{eq:eta2}. \textcite{Dawid1981,Arnold1981} introduced the term matrix normal and, in particular, \textcite{Arnold1981} provided several theoretical results, such as its density, moments and conditional distributions of its components. The matrix normal distribution is a bilinear normal distribution; a distribution of a two-way array, each component +representing a vector of observations \parencite{OhlsonEtAl2013}. \textcite{KolloVonRosen2005,Hoff2011,OhlsonEtAl2013} presented the extension of the bilinear to the multilinear normal distribution, what we call tensor normal in this paper, using a parallel extension of bilinear matrices to multilinear tensors \parencite{Comon2009}.
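To make the block-coordinate (``flip-flop'') updates referenced above concrete, the following NumPy sketch (a schematic illustration under our own naming, for the two-mode mean-zero case only; it is not the reference implementation, and our GMLM procedure updates the precisions $\mat{\Omega}_k$ instead) alternates the two covariance updates of a matrix normal model with separable covariance $\mat{\Sigma}_2\otimes\mat{\Sigma}_1$.
\begin{verbatim}
import numpy as np

def flip_flop(X, n_iter=50, tol=1e-8):
    # X has shape (n, p1, p2): n centered matrix-valued observations.
    # Alternate the two conditional MLE updates of the separable
    # covariance Sigma_2 (x) Sigma_1 of a mean-zero matrix normal.
    n, p1, p2 = X.shape
    Sigma1, Sigma2 = np.eye(p1), np.eye(p2)
    for _ in range(n_iter):
        S2_inv = np.linalg.inv(Sigma2)
        Sigma1_new = sum(Xi @ S2_inv @ Xi.T for Xi in X) / (n * p2)
        S1_inv = np.linalg.inv(Sigma1_new)
        Sigma2_new = sum(Xi.T @ S1_inv @ Xi for Xi in X) / (n * p1)
        delta = max(np.max(np.abs(Sigma1_new - Sigma1)),
                    np.max(np.abs(Sigma2_new - Sigma2)))
        Sigma1, Sigma2 = Sigma1_new, Sigma2_new
        if delta < tol:
            break
    return Sigma1, Sigma2

# Example with simulated data; the factors are identifiable only up to
# a scale c, i.e. (c Sigma_1, Sigma_2 / c), as discussed for the
# Kronecker components of the GMLM parameters.
rng = np.random.default_rng(1)
X = rng.normal(size=(200, 4, 3))
Sigma1_hat, Sigma2_hat = flip_flop(X)
\end{verbatim}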
+ +The defining feature of the matrix normal distribution, and its tensor extension, is the Kronecker product structure of its covariance. This formulation, where the covariates are multivariate normal with multiway covariance structure modeled as a Kronecker +product of matrices of much lower dimension, aims to overcome the significant modeling and computational challenges arising from the +high computational complexity of manipulating tensor representations \parencite[see, e.g.,][]{HillarLim2013,WangEtAl2022}. + +Multilinear tensor normal %Kronecker-separable covariance +models have been used in various applications, including +medical imaging \parencite{BasserPajevic2007,DrydenEtAl2009}, spatio-temporal data analysis \parencite{GreenewaldHero2014}, and regression analysis +for longitudinal relational data \parencite{Hoff2015}. +%, radar [AFJ10], and multiple-input-multiple-output (MIMO) communications [WJS08]. +One of the most important uses of the multilinear normal (MLN) distribution, and hence tensor analysis, is perhaps in magnetic resonance imaging (MRI) \parencite{OhlsonEtAl2013}. +A recent survey \parencite{WangEtAl2022} and references therein contain more information and potential applications of multilinear tensor normal models. +%The first occurrence of the \textit{matrix normal} we found, even though not explicitly called as such, was in \textcite{SrivastavaKhatri1979}. + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Ising Model}\label{sec:ising_estimation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The general distribution of a binary vector is modeled by the \emph{multi-variate Bernoulli distribution} (\textcite{GraphicalModels-Whittaker2009, MVB-Dai2012, MVB-DaiDingWahba2013}). The \emph{Ising model} \textcite{Ising-Ising1924} is a special case, considering only two-way interactions. Its probability mass function (PMF) for a binary random vector $X\in\{ 0, 1 \}^p$ with natural parameters $\mat{\gamma}\in\mathbb{R}^{p(p + 1) / 2}$ is given by +The Ising\footnote{Also known as the \emph{Lenz-Ising} model as the physical assumptions of the model were developed by both Lenz and Ising \parencite{Niss2005}. Ising gave a closed form solution for the 1-dimensional lattice, that is, a linear chain \parencite{Ising1925}.} model \parencite{Lenz1920,Ising1925,Niss2005} is a mathematical model originating in statistical physics to study ferromagnetism in a thermodynamic setting. It describes magnetic dipoles (atomic ``spins'') which can take two states ($\pm 1$) while allowing two-way interactions between direct neighbours on a lattice, a discrete grid. The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Nowadays, the Ising model, in its general form, allows for different coupling strengths for every (symmetric) interaction as well as an external magnetic field acting on every magnetic dipole separately. A modern review is given by \textcite{NguyenEtAl2017}. + +In statistics, the Ising model is used to model multivariate binary data. That is, the states are $\{0, 1\}$ instead of $\pm 1$.
It is related to a multitude of other models; \emph{Graphical Models} and \emph{Markov Random Fields} to describe conditional dependence \parencite{Lauritzen1996,WainwrightJordan2008,LauritzenRichardson2002}, \emph{Potts models} \parencite{Besag1974,ChakrabortyEtAl2022} which generalize the Ising model to multiple states, and the \emph{multivariate Bernoulli distribution} \parencite{Whittaker1990,JohnsonEtAl1997,DaiDingWahba2013} which also considers higher order interactions (three-way and higher), to name the most prominent. + +The $p$-dimensional Ising model is a discrete probability distribution on the set of $p$-dimensional binary vectors $\mat{x}\in\{0, 1\}^p$ with pmf given by \begin{displaymath} P_{\mat{\gamma}}(\mat{x}) = p_0(\mat{\gamma})\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}). \end{displaymath} -The scaling factor $p_0(\mat{\gamma})\in\mathbb{R}_{+}$ ensures that $P_{\mat{\gamma}}$ is a PMF. It is equal to the probability of the zero event $P(X = \mat{0}) = p_0(\mat{\gamma})$. More commonly known as the \emph{partition function}, the reciprocal of $p_0$, is given by +The scaling factor $p_0(\mat{\gamma})\in\mathbb{R}_{+}$ ensures that $P_{\mat{\gamma}}$ is a pmf. It is equal to the probability of the zero event $P(X = \mat{0}) = p_0(\mat{\gamma})$. The reciprocal of $p_0$, more commonly known as the \emph{partition function}, is given by \begin{equation}\label{eq:ising-partition-function} p_0(\mat{\gamma})^{-1} = \sum_{\mat{x}\in\{0, 1\}^p}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}). \end{equation} -By an abuse of notation, let $\mat{\gamma}_{j l}$ denote the element of $\mat{\gamma}$ corresponding to $\mat{x}_j\mat{x}_l$ in $\vech(\mat{x}\t{\mat{x}})$\footnote{Specifically, the element $\mat{\gamma}_{j l}$ of $\mat{\gamma}$ is a short hand for $\mat{\gamma}_{\iota(j, l)}$ with $\iota(j, l) = (\min(j, l) - 1)(2 p - \min(j, l)) / 2 + \max(j, l)$ mapping the matrix row index $j$ and column index $l$ to the corresponding half vectorization indices $\iota(j, l)$.}. The ``diagonal'' parameter $\mat{\gamma}_{j j}$ expresses the conditional log odds of $X_j = 1\mid X_{-j} = \mat{0}$, where the negative subscript in $X_{-j}$ describes the $p - 1$ dimensional vector $X$ with the $j$th element removed. The ``off diagonal'' parameters $\mat{\gamma}_{j l}$, for $j\neq l$, are equal to the conditional log odds of simultanious occurence $X_j = 1, X_l = 1 \mid X_{-j, -l} = \mat{0}$. More precise, for $j\neq l$, the conditional probabitities and the natural parameters are related by +By an abuse of notation, we let $\mat{\gamma}_{j l}$ denote the element of $\mat{\gamma}$ corresponding to $\mat{x}_j\mat{x}_l$ in $\vech(\mat{x}\t{\mat{x}})$\footnote{Specifically, the element $\mat{\gamma}_{j l}$ of $\mat{\gamma}$ is a shorthand for $\mat{\gamma}_{\iota(j, l)}$ with $\iota(j, l) = (\min(j, l) - 1)(2 p - \min(j, l)) / 2 + \max(j, l)$ mapping the matrix row index $j$ and column index $l$ to the corresponding half vectorization indices $\iota(j, l)$.}. The ``diagonal'' parameter $\mat{\gamma}_{j j}$ expresses the conditional log odds of $X_j = 1\mid X_{-j} = \mat{0}$, where the negative subscript in $X_{-j}$ describes the $p - 1$ dimensional vector $X$ with the $j$th element removed. The off diagonal entries $\mat{\gamma}_{j l}$, $j\neq l$, are equal to the conditional log odds of simultaneous occurrence $X_j = 1, X_l = 1 \mid X_{-j, -l} = \mat{0}$.
More precisely, the conditional probabilities and the natural parameters are related via \begin{align} \mat{\gamma}_{j j} &= \log\frac{P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})}{1 - P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})}, \nonumber \\ \mat{\gamma}_{j l} &= \log\frac{1 - P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{-l} = \mat{0})}{P_{\mat{\gamma}}(X_j = 1\mid X_{-j} = \mat{0})P_{\mat{\gamma}}(X_l = 1\mid X_{-l} = \mat{0})}\frac{P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{-j, -l} = \mat{0})}{1 - P_{\mat{\gamma}}(X_j = 1, X_l = 1\mid X_{-j, -l} = \mat{0})} \label{eq:ising-two-way-log-odds}. \end{align} -Conditional Ising models, incorporating the information of covariates $Y$ into the model, have also been considered \textcite{sparseIsing-ChengEtAt2014, sdr-mixedPredictors-BuraForzaniEtAl2022}. The direct way is to parameterize $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$. +Conditional Ising models, incorporating the information of covariates $Y$ into the model, were considered by \textcite{ChengEtAl2014,BuraEtAl2022}. The direct way is to parameterize $\mat{\gamma} = \mat{\gamma}_y$ by the covariate $Y = y$ to model a conditional distribution $P_{\mat{\gamma}_y}(\mat{x}\mid Y = y)$. -We extend the conditional PMF by allowing the binary variables to be tensor values, that is for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$ we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$. Considering the tensor structure of $\ten{X}$ is done by assuming Kronecker product constraints to the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the tensor normal model. This means that we compare the PMF $P_{\mat{\gamma}_y}(\vec{\ten{X}} | Y = y)$ to the quadratic exponential family \eqref{eq:quadratic-exp-fam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$. This gives the GMLM model as +We extend the conditional pmf by allowing the binary variables to be tensor valued; that is, we set $\mat{x} = \vec{\ten{X}}$, with dimension $p = \prod_{k = 1}^{r}p_k$ for $\ten{X}\in\{ 0, 1 \}^{p_1\times\cdots\times p_r}$. The tensor structure of $\ten{X}$ is accommodated by assuming Kronecker product constraints to the parameter vector $\mat{\gamma}_y$ in a similar fashion as for the tensor normal model. This means that we compare the pmf $P_{\mat{\gamma}_y}(\vec{\ten{X}} | Y = y)$ with the quadratic exponential family \eqref{eq:quadratic-exp-fam} with the natural parameters modeled by \eqref{eq:eta1} and \eqref{eq:eta2}. A detail to be considered is that the diagonal of $(\vec{\ten{X}})\t{(\vec{\ten{X}})}$ is equal to $\vec{\ten{X}}$. This gives the GMLM model as \begin{align} P_{\mat{\gamma}_y}(\ten{X} \mid Y = y) &= p_0(\mat{\gamma}_y)\exp(\t{\vech((\vec{\ten{X}})\t{(\vec{\ten{X}})})}\mat{\gamma}_y) \nonumber \\ &= p_0(\mat{\gamma}_y)\exp\Bigl(\Bigl\langle \ten{X}, \ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k \Bigr\rangle + \Bigl\langle\ten{X}\mlm_{k = 1}^{r}\mat{\Omega}_k, \ten{X}\Bigr\rangle\Bigr)\label{eq:ising-cond-prob} \end{align} -where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This is an additional constraint to the model, the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, althoug not fully. 
Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of those approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardles, under our modeling choise we get the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ as +where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This imposes an additional constraint on the model; the reason is that the diagonal elements of $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ take the role of $\overline{\ten{\eta}}$, although not fully. Having the diagonal of $\mat{\Omega}$ and $\overline{\ten{\eta}}$ handling the self interaction effects might lead to interference in the optimization routine. Another approach would be to use the $\mat{T}_2$ matrix to set the corresponding diagonal elements of $\mat{\Omega}$ to zero and let $\overline{\ten{\eta}}$ handle the self interaction effect. All of those approaches, namely setting $\overline{\ten{\eta}} = 0$, keeping $\overline{\ten{\eta}}$ or using $\mat{T}_2$, are theoretically solid and compatible with \cref{thm:grad,thm:param-manifold,thm:asymptotic-normality-gmlm}, assuming all axis dimensions $p_k$ are non-degenerate, that is $p_k > 1$ for all $k = 1, \ldots, r$. Regardless, under our modeling choice the relation between the natural parameters $\mat{\gamma}_y$ of the conditional Ising model and the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$ is \begin{equation}\label{eq:ising-natural-params} % \t{\pinv{\mat{D}_p}}\mat{\gamma}_y % = \vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y})) @@ -677,19 +620,18 @@ where we set $\overline{\ten{\eta}} = 0$ and $\mat{T}_2$ to the identity. This i = \t{\mat{D}_p}\vec(\mat{\Omega} + \diag(\mat{B}\vec{\ten{F}_y})) = \t{\mat{D}_p}\vec\Biggl(\bigkron_{k = r}^{1}\mat{\Omega}_k + \diag\biggl(\vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr)\biggr)\Biggr). \end{equation} - -In contract to the tensor normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specificaly, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode wise reduction matrices $\mat{\beta}_k$ need to be elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choises are listed in \cref{sec:matrix-manifolds}. \todo{check if we need to exclude zero here!} +In contrast to the tensor normal GMLM, the matrices $\mat{\Omega}_k$ are only required to be symmetric. More specifically, we require $\mat{\Omega}_k$, for $k = 1, \ldots, r$, to be elements of an embedded submanifold of $\SymMat{p_k}$ (see: \cref{sec:kron-manifolds,sec:matrix-manifolds}). The mode wise reduction matrices $\mat{\beta}_k$ are elements of an embedded submanifold of $\mathbb{R}^{p_k\times q_k}$. Common choices are listed in \cref{sec:matrix-manifolds}.
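For small dimensions $p$, the conditional pmf \eqref{eq:ising-cond-prob} with its normalization \eqref{eq:ising-partition-function} can be evaluated by brute-force enumeration. The following sketch (our illustration only; names and the use of NumPy are ours, and it is not the supplementary implementation) assembles $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$ and $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$ and computes $p_0(\mat{\gamma}_y)$ together with the conditional first and second moments used by the gradients below; this is feasible only for small $p$, which is why larger dimensions require the Monte-Carlo approach of \cref{sec:ising-bigger-dim}.
\begin{verbatim}
import itertools
import numpy as np

def ising_conditional_moments(Omegas, betas, Fy):
    # Brute-force evaluation of the conditional Ising model: the
    # exponent <x, B vec(F_y)> + x' Omega x with Omega = kron(Omega_r,
    # ..., Omega_1) and B = kron(beta_r, ..., beta_1); for binary x this
    # is equivalent to the vech(x x')' gamma_y parameterization.
    Omega, B = np.ones((1, 1)), np.ones((1, 1))
    for Ok, bk in zip(reversed(Omegas), reversed(betas)):
        Omega = np.kron(Omega, Ok)
        B = np.kron(B, bk)
    M = Omega + np.diag(B @ Fy.reshape(-1, order="F"))
    p = M.shape[0]
    # Enumerate all 2^p binary vectors (only feasible for small p).
    X = np.array(list(itertools.product([0, 1], repeat=p)), dtype=float)
    w = np.exp(np.einsum("ij,jk,ik->i", X, M, X))   # unnormalized masses
    p0 = 1.0 / w.sum()                              # normalizing constant
    M2 = p0 * np.einsum("i,ij,ik->jk", w, X, X)     # E[x x' | Y = y]
    m1 = np.diag(M2)                                # E[x | Y = y]
    return p0, m1, M2

# Toy example: a 2 x 3 binary matrix predictor (p = 6), q_1 = q_2 = 1.
rng = np.random.default_rng(2)
Omegas = [np.zeros((2, 2)), np.zeros((3, 3))]
betas = [rng.normal(size=(2, 1)), rng.normal(size=(3, 1))]
p0, m1, M2 = ising_conditional_moments(Omegas, betas, Fy=np.ones((1, 1)))
\end{verbatim}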
To solve the optimization problem \eqref{eq:mle}, given a data set $(\ten{X}_i, y_i)$, for $i = 1, \ldots, n$, we use a variation of gradient descent. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Initial Values} -The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$, for $k = 1, \ldots, r$, it to use the tensor normal estimates from \cref{sec:tensor_normal_estimation}, interprating $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. Setting everything to the uninformed initial value, that is $\mat{\Omega}_k = \mat{0}$ as this corresponds to the conditional log odds to be $1:1$ for every component and pairwaide interaction. This is not possible, since $\mat{0}$ is a stationary point of the log-likelihood. This is directly observed by taking a look at the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic which threads every mode seperately and ignores any relation to the covariates. It is computationaly cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$'th mode second moment estimate be +The first step is to get reasonable starting values. Experiments showed that a good starting value of $\mat{\beta}_k$, for $k = 1, \ldots, r$, is to use the tensor normal estimates from \cref{sec:tensor_normal_estimation}, considering $\ten{X}_i$ as continuous. For initial values of $\mat{\Omega}_k$, a different approach is required. Setting everything to the uninformed initial value $\mat{\Omega}_k = \mat{0}$, which corresponds to conditional log odds of $1:1$ for every component and pairwise interaction, is not possible since $\mat{0}$ is a stationary point of the log-likelihood. This is directly observed from the partial gradients of the log-likelihood in \cref{thm:grad}. Instead, we use a crude heuristic which treats every mode separately and ignores any relation to the covariates. It is computationally cheap and better than any of the alternatives we considered. For every $k = 1, \ldots, r$, let the $k$'th mode second moment estimate be \begin{equation}\label{eq:ising-mode-moments} \hat{\mat{M}}_{2(k)} = \frac{p_k}{n p}\sum_{i = 1}^n (\ten{X}_i)_{(k)}\t{(\ten{X}_i)_{(k)}} \end{equation} -which contains the $k$'th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. Considering every column of the matricized observation $(\ten{X}_i)_{(k)}$ as a $p_k$ dimensional observation itself. The number of those artifically generated observations is $n \prod_{j\neq k}p_j$. Let $Z_k$ denote the random variable those artifical observations are realization of. Then, we can interprate the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as the estimates of the probability $P((Z_k)_j = 1)$, that is the marginal probability of the $j$th element of $Z_k$ being $1$. Similar, for $l \neq j$ we have $(\hat{\mat{M}}_{2(k)})_{j l}$ estimating $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of two-way interactions. % Without any regard of accuracy ... +which contains the $k$'th mode first moment estimate in its diagonal $\hat{\mat{M}}_{1(k)} = \diag\hat{\mat{M}}_{2(k)}$. Here, every column of the matricized observation $(\ten{X}_i)_{(k)}$ is treated as a $p_k$ dimensional observation in its own right. The number of those artificially generated observations is $n \prod_{j\neq k}p_j$.
Let $Z_k$ denote the random variable those artificial observations are realizations of. Then, we can interpret the elements $(\hat{\mat{M}}_{1(k)})_{j}$ as the estimates of the probability $P((Z_k)_j = 1)$, that is, the marginal probability of the $j$th element of $Z_k$ being $1$. Similarly, for $l \neq j$, we have $(\hat{\mat{M}}_{2(k)})_{j l}$ estimating $P((Z_k)_j = 1, (Z_k)_l = 1)$, the marginal probability of two-way interactions. % Without any regard of accuracy ... Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diagonal elements of $\mat{\Omega}_k$, we equate the conditional probabilities $P((Z_k)_j = 1 \mid (Z_k)_{-j} = \mat{0})$ and $P((Z_k)_j = 1, (Z_k)_l = 1\mid (Z_k)_{-j, -l} = \mat{0})$ with the marginal probability estimates $(\hat{\mat{M}}_{1(k)})_{j}$ and $(\hat{\mat{M}}_{2(k)})_{j l}$, respectively. Using \eqref{eq:ising-two-way-log-odds} then gives the initial estimates $\hat{\mat{\Omega}}_k^{(0)}$, with $j \neq l$ component wise \begin{equation}\label{eq:ising-init-Omegas} (\hat{\mat{\Omega}}_k^{(0)})_{j j} = 0, @@ -699,30 +641,30 @@ Now, we set the diagonal elements of $\mat{\Omega}_k$ to zero. For the off diago %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Gradient Optimization} -Given initial values, the gradients provided by \cref{thm:grad} can be evaluated for the Ising model. The first step therefore is to determin the values of the inverse link components $\ten{g}_1(\mat{\gamma}_y) = \E[\ten{X} \mid Y = y]$ and $\ten{G}_2(\mat{\gamma}_y) = \ten{g}_2(\mat{\gamma}_y) = \E[\ten{X}\circ\ten{X} \mid Y = y]$. An imediate simplification is that the first moment is a part of the second moment. Its values are determined via $\vec(\E[\ten{X} \mid Y = y]) = \diag(\E[\ten{X}\circ\ten{X} \mid Y = y]_{(1, \ldots, r)})$. This means only the second moment needs to be computed, or estimated (see: \cref{sec:ising-bigger-dim}) in the case of slightly bigger $p$. For the Ising model, the conditional second moment with parameters $\mat{\gamma}_y$ is given by the matricized relation +Given initial values, the gradients provided by \cref{thm:grad} can be evaluated for the Ising model. The first step therefore is to determine the values of the inverse link components $\ten{g}_1(\mat{\gamma}_y) = \E[\ten{X} \mid Y = y]$ and $\ten{G}_2(\mat{\gamma}_y) = \ten{g}_2(\mat{\gamma}_y) = \E[\ten{X}\circ\ten{X} \mid Y = y]$. An immediate simplification is that the first moment is a part of the second moment. Its values are determined via $\vec(\E[\ten{X} \mid Y = y]) = \diag(\E[\ten{X}\circ\ten{X} \mid Y = y]_{(1, \ldots, r)})$. This means only the second moment needs to be computed, or estimated (see: \cref{sec:ising-bigger-dim}) in the case of slightly bigger $p$. For the Ising model, the conditional second moment with parameters $\mat{\gamma}_y$ is given by the matricized relation \begin{equation}\label{eq:ising-m2} \ten{g}_2(\ten{\gamma}_y)_{(1, \ldots, r)} = \E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y] = p_0(\mat{\gamma}_y)\sum_{\mat{x}\in\{0, 1\}^{p}}\mat{x}\t{\mat{x}}\exp(\t{\vech(\mat{x}\t{\mat{x}})}\mat{\gamma}_y).
\end{equation} -The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural-params} enabeling us to compute the partial gradients of the log-likelihood $l_n$ \eqref{eq:log-likelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usualy something like $10^{-3}$. The update rule is +The natural parameter $\mat{\gamma}_y$ is evaluated via \eqref{eq:ising-natural-params}, enabling us to compute the partial gradients of the log-likelihood $l_n$ \eqref{eq:log-likelihood} for the Ising model by \cref{thm:grad} for the GMLM parameters $\mat{\beta}_k$ and $\mat{\Omega}_k$, $k = 1, \ldots, r$, at the current iterate $\mat{\theta}^{(I)} = (\mat{\beta}_1^{(I)}, \ldots, \mat{\beta}_r^{(I)}, \mat{\Omega}_1^{(I)}, \ldots, \mat{\Omega}_r^{(I)})$. Using classic gradient ascent for maximizing the log-likelihood, we have to specify a learning rate $\lambda\in\mathbb{R}_{+}$, usually of the order of $10^{-3}$. The update rule is \begin{displaymath} \mat{\theta}^{(I + 1)} = \mat{\theta}^{(I)} + \lambda\nabla_{\mat{\theta}} l_n(\mat{\theta})\bigr|_{\mat{\theta} = \mat{\theta}^{(I)}}, \end{displaymath} -which is iterated till convergence. In practice, iteration is performed until ether a maximum number of iterations is exhausted and/or some break condition is satisfied. A proper choise of the learning rate is needed as a too large learning rate $\lambda$ causes instabilities, while a too low learning rate requires an enourmes ammount of iterations. Generically, there are two approach against the need to determine a proper lerning rate. First, \emph{line search methods} determin an appropriate step size for every iteration. This works great if the evaluation of the object function (the log-likelihood) is cheap. This is not the case in our setting, see \cref{sec:ising-bigger-dim}. The second approach is an \emph{addaptive learning rate}. The basic idea is to track specific statistics while optimizing and dynamiclly addapt the leaning rate via well tested heuristics using the gathered knowledge from past iterations. We opted to use an addaptive leaning rate approach, this not only levaites the need to determin an approriate leaning rate, but also excelerates learning. +which is iterated until convergence. In practice, iteration is performed until either a maximum number of iterations is exhausted or some break condition is satisfied. A proper choice of the learning rate is needed, as a too large learning rate $\lambda$ causes instabilities, while a too low learning rate requires an enormous number of iterations. Generically, there are two approaches that avoid the need to hand-tune the learning rate. First, \emph{line search methods} determine an appropriate step size for every iteration. This works well if evaluating the objective function (the log-likelihood) is cheap, which is not the case in our setting (see \cref{sec:ising-bigger-dim}). The second approach is an \emph{adaptive learning rate}. The basic idea is to track specific statistics while optimizing and to dynamically adapt the learning rate via well-tested heuristics that use the knowledge gathered from past iterations.
We opted for an adaptive learning rate approach, which not only removes the need to determine an appropriate learning rate, but also accelerates learning. -Our method of choise is RMSprop, which stands for \emph{root mean squared propagation} \textcite{rmsprop-Hinton2012}. This is a well known method in maschine learning for training neural networks. Its a variation of gradient descent with an per scalar parameter addaptive learning rate. It tracks a moving average of the element wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element wise) the gradient in the update rule. See \textcite{rmsprop-Hinton2012,deeplearningbook-GoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is +Our method of choice is RMSprop, which stands for \emph{root mean squared propagation} \parencite{Hinton2012}. This is a well known method in machine learning for training neural networks. It is a variation of gradient descent with an adaptive learning rate for every scalar parameter. It tracks a moving average of the element wise squared gradient $\mat{g}_2^{(I)}$, which is then used to scale (element wise) the gradient in the update rule. See \textcite{Hinton2012,GoodfellowEtAl2016} among others. The update rule using RMSprop for maximization\footnote{Instead of the more common minimization, therefore $+$ in the update of $\mat{\theta}$.} is \begin{align*} \mat{g}_2^{(I + 1)} &= \nu \mat{g}_2^{(I)} + (1 - \nu)\nabla l_n(\mat{\theta}^{(I)})\odot\nabla l_n(\mat{\theta}^{(I)}), \\ \mat{\theta}^{(I + 1)} &= \mat{\theta}^{(I)} + \frac{\lambda}{\sqrt{\mat{g}_2^{(I + 1)}} + \epsilon}\odot\nabla l_n(\mat{\theta}^{(I)}). \end{align*} -The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed. The initial value of $\mat{g}_2^{(0)} = \mat{0}$, the symbol $\odot$ denotes the Hadamard product, that is the element wise multiplication. The divition and sqaure root operation are performed element wise as well. According to our experiments, RMSprop requires in the range of $50$ till $1000$ iterations till convergence while gradient ascent with a learning rate of $10^{-3}$ is in the range of $1000$ till $10000$. \todo{check this!} +The parameters $\nu = 0.9$, $\lambda = 10^{-3}$ and $\epsilon\approx 1.49\cdot 10^{-8}$ are fixed. The initial value is $\mat{g}_2^{(0)} = \mat{0}$, and the symbol $\odot$ denotes the Hadamard product, that is, element wise multiplication. The division and square root operations are performed element wise as well. According to our experiments, RMSprop requires between $50$ and $1000$ iterations until convergence, while gradient ascent with a learning rate of $10^{-3}$ requires between $1000$ and $10000$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Small Data Sets}\label{sec:ising-small-data-sets} -In case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one components is always ether zero or one can occure. Its also possible to observe two exclusive components. This situation of a ``degenerate'' data set needs to be saveguarded against in practive. Working with parameters on a log-scale, this gives estimates of $\pm\infty$. This is outside of the parameter space and breaks our optimization algorithm.
+In the case of a finite number of observations, specifically in data sets with a small number of observations $n$, the situation where one component is always either zero or one can occur. It is also possible to observe two mutually exclusive components. Such a ``degenerate'' data set needs to be safeguarded against in practice. Working with parameters on a log scale, it leads to estimates of $\pm\infty$, which lie outside the parameter space and break our optimization algorithm. -The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid divition by zero as well as evaluating the log of zero, we addapt \eqref{eq:ising-mode-moments}, the mode wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that are entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurence $p_k / n p$, or all but one occurence $1 - p_k / n p$, respectively. +The first situation where this needs to be addressed is in \eqref{eq:ising-init-Omegas}, where we set initial estimates for $\mat{\Omega}_k$. To avoid division by zero as well as evaluating the log of zero, we adapt \eqref{eq:ising-mode-moments}, the mode wise moment estimates $\hat{\mat{M}}_{2(k)}$. A simple method is to replace the ``degenerate'' components, that is, entries with value $0$ or $1$, with the smallest positive estimate of exactly one occurrence $p_k / n p$, or all but one occurrence $1 - p_k / n p$, respectively. -The same problem arives in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we try to alter every component as little as possible to ensure that the non-degenerate elements in $\hat{\mat{\Omega}}$, effected by this change due to its Kronecker structure, are altered as little as possible. The exact details are technically cumbersome while providing little insight. \todo{For more details we refer the reader to the source code prodived with the supplemental material.} +The same problem is present in gradient optimization. Therefore, before starting the optimization, we detect degenerate combinations. We compute upper and lower bounds for the ``degenerate'' element in the Kronecker product $\hat{\mat{\Omega}} = \bigkron_{k = r}^{1}\hat{\mat{\Omega}}_k$. After every gradient update, we check if any of the ``degenerate'' elements fall outside of the bounds. In that case, we adjust all the elements of the Kronecker component estimates $\hat{\mat{\Omega}}_k$, corresponding to the ``degenerate'' element of their Kronecker product, to fall inside the precomputed bounds. While doing so, we alter every component as little as possible, so that the non-degenerate elements in $\hat{\mat{\Omega}}$, which are affected by this change due to the Kronecker structure, change as little as possible. The exact details are technically cumbersome while providing little insight.
\todo{For more details we refer the reader to the source code prodived with the supplementary material.} @@ -736,7 +678,7 @@ For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method t \begin{figure} \centering \includegraphics[]{plots/sim-ising-perft-m2.pdf} - \caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using ether the exact method or a Monte-Carlo simulation.} + \caption{\label{fig:ising-m2-perft}Performance test for computing/estimating the second moment of the Ising model of dimension $p$ using ether the exact method or a Monte-Carlo (MC) simulation.} \end{figure} @@ -751,9 +693,9 @@ For estimation of dimensions $p$ bigger than $20$, we use a Monte-Carlo method t The main problem in obtaining asymptotic results for the MLE of the constrained parameter $\mat{\theta} = (\overline{\ten{\eta}}, \vec\mat{B}, \vech\mat{\Omega})$ stems from the nature of the constraint. We assumed that $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, where the parameter $\mat{B}$ is identifiable. This means that different values of $\mat{B}$ lead to different densities $f_{\mat{\theta}}(\ten{X}\mid Y = y)$, a basic property needed to ensure consistency of parameter estimates, which in turn is needed for asymptotic normality. On the other hand, the components $\mat{\beta}_j$, $j = 1, \ldots, r$, are \emph{not} identifiable, which is a direct consequence of the equality $\mat{\beta}_2\otimes\mat{\beta}_1 = (c\mat{\beta}_2)\otimes (c^{-1}\mat{\beta}_1)$ for every $c\neq 0$. This is the reason we formulated $\Theta$ as a constrained parameter space instead of parameterizing the densities of $\ten{X}\mid Y$ with respect to the components $\mat{\beta}_1, \ldots, \mat{\beta}_r$. The same is true for $\mat{\Omega} = \bigkron_{k = r}^{1}\mat{\Omega}_k$. -In addition to identifiable parameters, asymptotic normality obtained in \cref{thm:asymptotic-normality-gmlm} requires differentiation. Therefore, the space itself needs to admit defining differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption we consider \emph{smooth manifolds}. The latter are spaces which look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. To make matters worse, a smooth manifold only allows first derivatives. Without going into details, the solution is a \emph{Riemannian manifold}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition as well as complicated to handle in an already complicated setting. This is where an \emph{embedded (sub)manifold} comes to rescue. Simply speaking, an embedded manifold is a manifold which is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complication of the abstract manifold theory simplifies drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be an \emph{smooth embedded submanifold} in an open subset $\Xi$ of a Euclidean space. 
+In addition to identifiable parameters, asymptotic normality obtained in \cref{thm:asymptotic-normality-gmlm} requires differentiation. Therefore, the space itself needs to admit defining differentiation, which is usually a vector space. This is too strong an assumption for our purposes. To weaken the vector space assumption we consider \emph{smooth manifolds}. The latter are spaces which look like Euclidean spaces locally and allow the notion of differentiation. The more general \emph{topological} manifolds are too weak for differentiation. To make matters worse, a smooth manifold only allows first derivatives. Without going into details, the solution is a \emph{Riemannian manifold}. Similar to an abstract \emph{smooth manifold}, Riemannian manifolds are detached from our usual intuition as well as complicated to handle in an already complicated setting. This is where an \emph{embedded (sub)manifold} comes to the rescue. Simply speaking, an embedded manifold is a manifold which is a subset of a manifold from which it inherits its properties. If a manifold is embedded in a Euclidean space, almost all the complication of the abstract manifold theory simplifies drastically. Moreover, since a Euclidean space is itself a Riemannian manifold, we inherit the means for higher derivatives. Finally, smooth embedded submanifold structure for the parameter space maintains consistency with existing approaches and results for parameter sets with linear subspace structure. These reasons justify the constraint that the parameter space $\Theta$ be an \emph{smooth embedded submanifold} in an open subset $\Xi$ of a Euclidean space. -Now, we directly define a \emph{smooth manifold} embedded in $\mathbb{R}^p$ without any detours to the more generel theory. See for example \textcite{introToSmoothMani-Lee2012,,introToRiemannianMani-Lee2018,optimMatrixMani-AbsilEtAl2007,aufbauAnalysis-kaltenbaeck2021} among others. +Now, we directly define a \emph{smooth manifold} embedded in $\mathbb{R}^p$ without any detours to the more generel theory. See for example \textcite{Lee2012,,Lee2018,AbsilEtAl2007,Kaltenbaeck2021} among others. \begin{definition}[Manifolds]\label{def:manifold} A set $\manifold{A}\subseteq\mathbb{R}^p$ is an \emph{embedded smooth manifold} of dimension $d$ if for every $\mat{x}\in\manifold{A}$ there exists a smooth\footnote{Here \emph{smooth} means infinitely differentiable or $C^{\infty}$.} bi-continuous map $\varphi:U\cap\manifold{A}\to V$, called a \emph{chart}, with $\mat{x}\in U\subseteq\mathbb{R}^p$ open and $V\subseteq\mathbb{R}^d$ open. \end{definition} @@ -773,7 +715,7 @@ We also need the concept of a \emph{tangent space} to formulate asymptotic norma \begin{figure} \centering \includegraphics[width = 0.5\textwidth]{images/TorustangentSpace.pdf} - \caption{\label{fig:torus}Visualization of the tangent space $T_{\mat{x}}\manifold{A}$ at $\mat{x}$ of the torus $\manifold{A}$. The torus $\manifold{A}$ is a 2-dimensional embedded manifold in $\mathbb{R}^3$. The tangent space $T_{\mat{x}}\manifold{A}\subset\mathbb{R}^3$ is a the 2-dimensional hyperplane visualized with its origin $\mat{0}$ shifted to $\mat{x}$. Moreover, two curves $\gamma_1, \gamma_2$ on the torus are drawn with $\mat{x} = \gamma_1(0) = \gamma_2(0)$. The curve velocity vectors $\t{\nabla\gamma_1(0)}$ and $\t{\nabla\gamma_2(0)}$ are drawn as tangent vectors with root $\mat{x}$.} + \caption{\label{fig:torus}Visualization of the tangent space $T_{\mat{x}}\manifold{A}$ at $\mat{x}$ of the torus $\manifold{A}$. 
The torus $\manifold{A}$ is a 2-dimensional embedded manifold in $\mathbb{R}^3$. The tangent space $T_{\mat{x}}\manifold{A}\subset\mathbb{R}^3$ is a 2-dimensional hyperplane visualized with its origin $\mat{0}$ shifted to $\mat{x}$. Moreover, two curves $\gamma_1, \gamma_2$ on the torus are drawn with $\mat{x} = \gamma_1(0) = \gamma_2(0)$. The curve velocity vectors $\t{\nabla\gamma_1(0)}$ and $\t{\nabla\gamma_2(0)}$ are drawn as tangent vectors with root $\mat{x}$.} \end{figure} @@ -799,7 +741,7 @@ As a basis to ensure that the constrained parameter space $\Theta$ is a manifold \quad\text{and}\quad \manifold{K}_{\mat{\Omega}} = \Bigl\{ \bigkron_{k = r}^{1}\mat{\Omega}_k : \mat{\Omega}_k\in\manifold{O}_k \Bigr\} \end{displaymath} - where $\manifold{B}_k\subset\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ and $\manifold{O}_k\subset\mathbb{R}^{p_k\times p_k}\backslash\{\mat{0}\}$ are smooth embedded manifolds which are either spheres or cones, for $k = 1, ..., r$. Furthermore, let + where $\manifold{B}_k\subseteq\mathbb{R}^{p_k\times q_k}\backslash\{\mat{0}\}$ and $\manifold{O}_k\subseteq\mathbb{R}^{p_k\times p_k}\backslash\{\mat{0}\}$ are smooth embedded manifolds which are either spheres or cones, for $k = 1, ..., r$. Furthermore, let \begin{displaymath} \manifold{CK}_{\mat{\Omega}} = \{ \vech{\mat{\Omega}} : \mat{\Omega}\in\manifold{K}_{\mat{\Omega}} \land \pinv{(\mat{T}_2\pinv{\mat{D}_p})}\mat{T}_2\pinv{\mat{D}_p}\vec{\mat{\Omega}} = \vec{\mat{\Omega}} \} \end{displaymath} @@ -872,7 +814,7 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi \begin{theorem}[Asymptotic Normality]\label{thm:asymptotic-normality-gmlm} - Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \mat{\Omega}_0)\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity \crefrange{cond:differentiable-and-convex}{cond:finite-sup-on-compacta} in the appendix, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$ over $\Theta$. That is, $ \hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$. Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal, + Assume $Z = (\ten{X}, Y)$ satisfies model \eqref{eq:quadratic-exp-fam} subject to \eqref{eq:eta1-manifold} and \eqref{eq:eta2-manifold} with true constrained parameter $\mat{\theta}_0 = (\overline{\eta}_0, \mat{B}_0, \mat{\Omega}_0)\in\Theta$, where $\Theta$ is defined in \cref{thm:param-manifold}. Under the regularity \crefrange{cond:differentiable-and-convex}{cond:finite-sup-on-compacta} in the appendix, there exists a strong M-estimator sequence $\hat{\mat{\theta}}_n$ deriving from $l_n$ in \eqref{eq:log-likelihood} over $\Theta$. Furthermore, any strong M-estimator $\hat{\mat{\theta}}_n$ converges in probability to the true parameter $\mat{\theta}_0$ over $\Theta$. That is, $ \hat{\mat{\theta}}_n\xrightarrow{p}\mat{\theta}_0$. 
Moreover, every strong M-estimator $\hat{\mat{\theta}}_n$ is asymptotically normal, \begin{displaymath} \sqrt{n}(\hat{\mat{\theta}}_n - \mat{\theta}_0) \xrightarrow{d} \mathcal{N}_{p(p + 2 q + 3) / 2}(0, \mat{\Sigma}_{\mat{\theta}_0}) \end{displaymath} @@ -884,7 +826,7 @@ It is not necessary to have a perfect maximizer, as long as the objective has fi \subsection{Asymptotic Normality} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The following is a reformulation of \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018} which assumes Condition~2.2 to hold. The existence of a mapping in Condition~2.2 is not needed for Lemma~2.3. It suffices that the restricted parameter space $\Theta$ is a subset of the unrestricted parameter space $\Xi$, which is trivially satisfied in our setting. Under this, \cref{thm:exists-strong-M-estimator-on-subsets} follows directly from \textcite[Lemma~2.3]{asymptoticMLE-BuraEtAl2018}. +The following is a reformulation of \textcite[Lemma~2.3]{BuraEtAl2018} which assumes Condition~2.2 to hold. The existence of a mapping in Condition~2.2 is not needed for Lemma~2.3. It suffices that the restricted parameter space $\Theta$ is a subset of the unrestricted parameter space $\Xi$, which is trivially satisfied in our setting. Under this, \cref{thm:exists-strong-M-estimator-on-subsets} follows directly from \textcite[Lemma~2.3]{BuraEtAl2018}. \begin{theorem}[Existence of strong M-estimators on Subsets]\label{thm:exists-strong-M-estimator-on-subsets} Assume there exists a (weak/strong) M-estimator $\hat{\mat{\xi}}_n$ for $M_n$ over $\Xi$, then there exists a strong M-estimator $\hat{\mat{\theta}}_n$ for $M_n$ over any non-empty $\Theta\subseteq\Xi$. @@ -900,9 +842,6 @@ for every non-empty compact $K\subset\Xi$. Then, there exists a strong M-est \end{theorem} - -\todo{The assumptions of the following can be a bit weakened, is this neccessary? For example the Hessian can be singular but is non-singular constrained to the tangent space. We can also only define $\mat{\theta}\mapsto m_{\mat{\theta}}$ only on the manifold which makes the statement much more technical, but way more general while we need to ensure that every smooth local extension of $\mat{\theta}\mapsto m_{\mat{\theta}}$ yields the same statement, which it does, but well, then it gets more complicated! Maybe add these things as a remark? The even more general formulation for Riemannian Manifolds is definitely over the top!} - \begin{theorem}[Asymptotic Normality for M-estimators on Manifolds]\label{thm:M-estimator-asym-normal-on-manifolds} Let $\Theta\subseteq\mathbb{R}^p$ be a smooth embedded manifold. For each $\mat{\theta}$ in a neighborhood in $\mathbb{R}^p$ of the true parameter $\mat{\theta}_0\in\Theta$ let $z\mapsto m_{\mat{\theta}}(z)$ be measurable and $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ be differentiable at $\mat{\theta}_0$ for almost all $z$. Assume also that there exists a measurable function $u$ such that $\E[u(Z)^2] < \infty$, and for almost all $z$ as well as all $\mat{\theta}_1, \mat{\theta}_2$ in a neighborhood of $\mat{\theta}_0$ such that \begin{displaymath} @@ -919,18 +858,18 @@ for every non-empty compact $K\subset\Xi$. Then, there exists a strong M-est \begin{remark} - \cref{thm:M-estimator-asym-normal-on-manifolds} has as special case Theorem~5.23 in \textcite{asymStats-van_der_Vaart1998}, when $\Theta$ is open subset of a Euclidean space as opposed to a smooth embedded manifold. 
- \todo{I don't like it that much, mention that an open set if an embedded manifold implying that $\mat{\Pi}_{\mat{\theta}_0} = \mat{H}_{\mat{\theta}_0}^{-1}$} + \cref{thm:M-estimator-asym-normal-on-manifolds} has as special case Theorem~5.23 in \textcite{vanderVaart1998}, when $\Theta$ is an open subset of a Euclidean space, which is the simplest form of an embedded manifold. \end{remark} +%%% I decided its fine to just NOT state those posibilities. Just gets comfusing without any additional insigt! +% \todo{The assumptions of the following can be a bit weakened, is this necessary? For example the Hessian can be singular but is non-singular constrained to the tangent space. We can also only define $\mat{\theta}\mapsto m_{\mat{\theta}}$ only on the manifold which makes the statement much more technical, but way more general while we need to ensure that every smooth local extension of $\mat{\theta}\mapsto m_{\mat{\theta}}$ yields the same statement, which it does, but well, then it gets more complicated! Maybe add these things as a remark? The even more general formulation for Riemannian Manifolds is definitely over the top!} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Simulations} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In this section we provide simulation results for the tensor normal as well as the Ising model where different aspects of the GMLM model are compaired against other methods. The comparison methods are tensor Sliced Inverse Regression (TSIR) \textcite{tsir-DingCook2015}, MGCCA \textcite{MGCCA-GirkaEtAl2024} and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \textcite{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are simply treated as continuous. As a base line we also include classic PCA on vectorized observations. \todo{check, fix, ...} +In this section we report simulation results for the tensor normal and the Ising model where different aspects of the GMLM model are compared against other methods. The comparison methods are Tensor Sliced Inverse Regression (TSIR) \parencite{DingCook2015}, MGCCA \todo{\parencite{ChenEtAl2021,GirkaEtAl2024}} and the Tucker decomposition that is a higher-order form of principal component analysis (HOPCA) \textcite{KoldaBader2009}, for both continuous and binary data. For the latter, the binary values are treated as continuous. As a base line we also include classic PCA on vectorized observations. In case of the Ising model, we also compare with LPCA (Logistic PCA) and CLPCA (Convex Logistic PCA), both introduced in \textcite{LandgrafLee2020}. All experiments are performed at sample size $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times. -All experiments are performed with different sample sizes $n = 100, 200, 300, 500$ and $750$. Every experiment is repeated $100$ times. - -We are interested in the quality of the estimate of the true sufficient reduction $\ten{R}(\ten{X})$ from \cref{thm:sdr}. Therefore, we compare with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. 
The distance $d(\mat{B}, \hat{\mat{B}})$ between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$ is the \emph{subspace distance} which is proportional to +We are interested in the quality of the estimate of the true sufficient reduction $\ten{R}(\ten{X})$ from \cref{thm:sdr}. Therefore, we compare with the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$, as it is compatible with any linear reduction method. The distance $d(\mat{B}, \hat{\mat{B}})$ between $\mat{B}\in\mathbb{R}^{p\times q}$ and an estimate $\hat{\mat{B}}\in\mathbb{R}^{p\times \tilde{q}}$ is the \emph{subspace distance} which is proportional to \begin{displaymath} d(\mat{B}, \hat{\mat{B}}) \propto \| \mat{B}\pinv{(\t{\mat{B}}\mat{B})}\t{\mat{B}} - \hat{\mat{B}}\pinv{(\t{\hat{\mat{B}}}\hat{\mat{B}})}\t{\hat{\mat{B}}} \|_F, \end{displaymath} @@ -939,70 +878,94 @@ the Frobenius norm of the difference between the projections onto the span of $\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Tensor Normal}\label{sec:sim-tensor-normal} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -For every tensor normal model we draw i.i.d. samples $\ten{X}_i$ for $i = 1, ..., n$ from the conditional distribution of $\ten{X}\mid Y = y_i$ where $y_i$ is an i.i.d. sample from the standard normal distribution. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional tensor normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}. +We generate a random sample $y_i$, $i=1,\ldots, n$, from the standard normal distribution. We then draw i.i.d. samples $\ten{X}_i$ for $i = 1, ..., n$ from the conditional tensor normal distribution of $\ten{X}\mid Y = y_i$. The conditional distribution $\ten{X}\mid Y = y_i$ depends on the choice of the GMLM parameters $\overline{\ten{\eta}}$, $\mat{\beta}_1, ..., \mat{\beta}_r$, $\mat{\Omega}_1, ..., \mat{\Omega}_r$, and the function $\ten{F}_y$ of $y$. In all experiments we set $\overline{\ten{\eta}} = \mat{0}$. The other parameters and $\ten{F}_y$ are described per experiment. With the true GMLM parameters and $\ten{F}_y$ given, we compute the conditional tensor normal mean $\ten{\mu}_y = \ten{F}_y\mlm_{k = 1}^{r}\mat{\Omega}_k^{-1}\mat{\beta}_k$ and covariances $\mat{\Sigma}_k = \mat{\Omega}_k^{-1}$ as in \eqref{eq:tnormal_cond_params}. -We start with a $1$ dimensional linear dependence on $y$ in 1a). Then, the dependence of $y$ is via a cubic polynomial 1b-d). In 1b) reduction is full rank, in constrast to 1c) where the $\mat{\beta}_k$'s are of rank $1$, in other words, low rank regression. In 1d) we constrain the inverse covariances $\mat{\Omega}_k$ to be tri-diagonal. Both, 1c-d) are examples of building the parameter space according to \cref{thm:param-manifold}. The final tensor normal experiment 1e) is a model missspecification. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is missspecified as well. 
+We consider the following settings: +%We start with a $1$ dimensional linear dependence on $y$ in 1a). Then, the dependence of $y$ is via a cubic polynomial 1b-d). In 1b) reduction is full rank, in contrast to 1c) where the $\mat{\beta}_k$'s are of rank $1$, in other words, low rank regression. In 1d) we constrain the inverse covariances $\mat{\Omega}_k$ to be tri-diagonal. Both, 1c-d) are examples of building the parameter space according to \cref{thm:param-manifold}. The final tensor normal experiment 1e) is a model misspecification. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well. \begin{itemize} - \item[1a)] The predictors $\ten{X}$ are $2\times 3\times 5$ dimensional, that is $r = 3$. The dependence through the inverse regression model is linear specifically means that $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regression like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$. - \item[1b)] The predictors $\ten{X}$ are again $3$ dimensional with dimension $2\times 3\times 5$ which relates to the response $y$ via a qubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$ by the twice iterated outer product of the vector $(1, y)$. Element wise this reads $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$. All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$'th unit vector and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$. - \item[1c)] Similar to 1b), except that the GMLM parameters $\mat{\beta}_k$ are rank $1$ given by + \item[1a)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, %The dependence through the inverse regression model is linear specifically means that + and $\ten{F}_y\equiv y$ is a $1\times 1\times 1$ tensor. The true $\mat{\beta}_k$'s are all equal to $\mat{e}_1\in\mathbb{R}^{p_k}$, the first unit vector, for $k \in \{1, 2, 3\}$. The matrices $\mat{\Omega}_k = \mathrm{AR}(0.5)$ follow an auto-regression like structure. That is, the elements are given by $(\mat{\Omega}_k)_{i j} = 0.5^{|i - j|}$. + \item[1b)] $\ten{X}$ is a three-way ($r = 3$) array of dimension $2\times 3\times 5$, and relates to the response $y$ via a qubic polynomial. This is modeled via $\ten{F}_y$ of dimension $2\times 2\times 2$ by the twice iterated outer product of the vector $(1, y)$. Element wise this reads $(\ten{F}_y)_{i j k} = y^{i + j + k - 3}$. All $\mat{\beta}_k$'s are set to $(\mat{e}_1, \mat{e}_2)\in\mathbb{R}^{p_k\times 2}$ with $\mat{e}_i$ the $i$th unit vector and the $\mat{\Omega}_k$'s are $\mathrm{AR}(0.5)$. + \item[1c)] Same as 1b), except that the GMLM parameters $\mat{\beta}_k$ are rank $1$ given by \begin{displaymath} \mat{\beta}_1 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix},\quad \mat{\beta}_2 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix},\quad \mat{\beta}_3 = \begin{pmatrix} 1 & -1 \\ -1 & 1 \\ 1 & -1 \\ -1 & 1 \\ 1 & -1 \end{pmatrix}. \end{displaymath} - \item[1d)] Again like 1b). This time the true $\mat{\Omega}_k$'s, for $k = 1, 2, 3$, are tri-diagonal. Their elements are given by $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$ with $\delta_{i, j}$ being the Kronecker delta. 
- \item[1e)] For the missspecification model we let $\ten{X}\mid Y$ be multivariate normal but \emph{not} tensor normal. Let $\ten{X}$ be $5\times 5$ dimensional, $Y$ is univariate standard normal and $\mat{f}_y$ is a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$ consisting of the first $4$ columns of the identify. The variance-covariance matrix $\mat{\Sigma}$ is an auto-regression like structure with correlation coefficient $0.5$. Element wise $\mat{B}_{i j} = \delta_{i j}$ and $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. Both, $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. Then, we set + \item[1d)] Same as 1b), but the true $\mat{\Omega}_k$ is tri-diagonal, for $k = 1, 2, 3$. Their elements are given by $(\mat{\Omega}_k)_{i j} = \delta_{0, |i - j|} + 0.5\delta_{1, |i - j|}$ with $\delta_{i, j}$ being the Kronecker delta. + \item[1e)] For the misspecification model we let $\ten{X}\mid Y$ be multivariate but \emph{not} tensor normal. Let $\ten{X}$ be a $5\times 5$ random matrix with normal entries, $Y$ univariate standard normal and $\mat{f}_y$ a $4$ dimensional vector given by $\mat{f}_y = (1, \sin(y), \cos(y), \sin(y)\cos(y))$. The true vectorized reduction matrix $\mat{B}$ is $25\times 4$ consisting of the first $4$ columns of the identity; i.e., $\mat{B}_{i j} = \delta_{i j}$. The variance-covariance matrix $\mat{\Sigma}$ has elements $\mat{\Sigma}_{i j} = 0.5^{|i - j|}$. %is an auto-regression like structure with correlation coefficient $0.5$. + Both, $\mat{B}$ and $\mat{\Omega} = \mat{\Sigma}^{-1}$ violate the Kronecker product assumptions \eqref{eq:eta1} and \eqref{eq:eta2} of the GMLM model. Then, we set \begin{displaymath} \vec{\ten{X}}\mid (Y = y) = \mat{B}\mat{f}_y + \mathcal{N}_{25}(\mat{0}, \mat{\Sigma}). \end{displaymath} - Furthermore, we fit the model with the wrong ``known'' function $\ten{F}_y$. We set $\ten{F}_y$ to be a $2\times 2$ matrix with a quadratic linkage via elements given by $(\ten{F}_y)_{i j} = y^{|i - j|}$. + Furthermore, we fit the model with the wrong ``known'' function $\ten{F}_y$. We set $\ten{F}_y$ to be a $2\times 2$ matrix with $(\ten{F}_y)_{i j} = y^{i + j - 2}$, $i,j=1,2$. \end{itemize} -\todo{How to describe that? I mean, sure, but what to write?} -The results of 1c) are surprising. The GMLM model behaves as expected, clearly being the best. The first surprise is that PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal to noise ration in this particular example. But the biggest surprise is the failure of TSIR. It turns out that TSIR is usualy well equiped to handle those specific low rank problems (given the true rank of the problem which is the case for all methods in every simulation), but by pure coincidense we picked a case where TSIR failes. Intending to pinpoint the specific problem we made another simulation where we change the tensor order $r$ from $2$ till $4$. Furthermore, we altered the coefficient $\rho$ for the auto regression type matrices $(\mat{\Omega}_k)_{i j} = \rho^{|i - j|}$. We let $\ten{F}_y$ be the $r$ times iterated outer product of the vector $(1, y)$. In the case of $r = 3$ this given the same $\ten{F}_y$ as in 1c). Then, we setup two scenarios where the definition of the true $\mat{\beta}_k$'s of rank $1$, for $k = 1, \ldots, r$, are different. The rest is identical to simulation 1c). 
-\begin{itemize} - \item[V1)] The first version sets all $\mat{\beta}_k$'s identical to - \begin{displaymath} - \mat{\beta}_k = \begin{pmatrix} 1 & 1 \\ 0 & 0 \end{pmatrix} - \end{displaymath} - which gives the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$ equal to a $2^r\times 2^r$ rank $1$ matrix with the first row all ones and the rest filled with zeros. The minimal true reduction is the $2^r$ dimensional first unit vector $\mat{e}_1$. In this setting the vectorized expected value is given by $\E[\vec{\ten{X}} \mid Y = y] = (1 + y)^r \bigkron_{k = r}^{1}\mat{\Omega}_k^{-1}$ +The final tensor normal experiment 1e) is a misspecified model to explore the robustness of our approach. The true model does \emph{not} have a Kronecker structure and the ``known'' function $\ten{F}_y$ of $y$ is misspecified as well. - \begin{displaymath} - \mat{\Omega}_k = \begin{pmatrix} - 1 & \rho \\ \rho & 1 - \end{pmatrix} - \end{displaymath} - \begin{displaymath} - \mat{\Sigma}_k = \mat{\Omega}_k^{-1} = \frac{1}{1 - \rho^2}\begin{pmatrix} - 1 & -\rho \\ -\rho & 1 - \end{pmatrix} - \end{displaymath} - \begin{displaymath} - \E[\ten{X} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigouter_{k = 1}^{r}\begin{pmatrix} - 1 \\ -\rho - \end{pmatrix} - \end{displaymath} - \begin{displaymath} - \E[\vec{\ten{X}} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigkron_{k = r}^{1}\begin{pmatrix} - 1 \\ -\rho - \end{pmatrix} - \end{displaymath} +\begin{figure}[hp!] + \centering + \includegraphics[width = \textwidth]{plots/sim-normal.pdf} + \caption{\label{fig:sim-normal}Visualization of the simulation results for the tensor normal GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-tensor-normal}.} +\end{figure} - In this setting only the first component of $\ten{X}$, that is $(\vec{\ten{X}})_1$, depends directly on $Y$ via $\E[(\vec{\ten{X}})_1\mid Y = y] = (1 + y)^r$. All other components contain information about $Y$ through the correlation structure only. \todo{check this!} - \item[V2)] Similar to the $\mat{\beta}_k$'s in 1c), we set all $\mat{\beta}_k$'s identical to - \begin{displaymath} - \mat{\beta}_k = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix} - \end{displaymath} - \begin{displaymath} - \E[\vec{\ten{X}}\mid Y = y] = \frac{(1 - y)^r}{(1 - \rho)^r}\bigkron_{k = r}^{1}\begin{pmatrix} - 1 \\ -1 - \end{pmatrix} - \end{displaymath} -\end{itemize} +The results are visualized in \cref{fig:sim-normal}. Simulation 1a), given a 1D linear relation between the response $Y$ and $\ten{X}$, TSIR and GMLM are equivalent. This is expected as \textcite{DingCook2015} already established that TSIR gives the MLE estimate under a tensor (matrix) normal distributed setting. For the other methods, MGCCA is only a bit better than PCA which, unexpectedly, beats HOPCA. But none of them are close to the performance of TSIR or GMLM. Continuing with 1b), where we introduced a qubic relation between $Y$ and $\ten{X}$, we observe a bigger deviation in the performance of GMLM and TSIR. This is caused mainly because we are estimating an $8$ dimensional subspace now, which amplifies the small performance boost, in the subspace distance, we gain by avoiding slicing. The results of 1c) are surprising. The GMLM model behaves as expected, clearly being the best. The first surprise is that PCA, HOPCA and MGCCA are visually indistinguishable. 
This is explained by a high signal to noise ration in this particular example. But the biggest surprise is the failure of TSIR. Even more surprising is that the conditional distribution $\ten{X}\mid Y$ is tensor normal distributed which in conjunction with $\cov(\vec\ten{X})$ having a Kronecker structure, should give the MLE estimate. The low-rank assumption is also not an issue, this simply relates to TSIR estimating a 1D linear reduction which fulfills all the requirements. Finally, a common known issue of slicing, used in TSIR, is that conditional multi-modal distributions can cause estimation problems due to the different distribution modes leading to vanishing slice means. Again, this is not the case in simulation 1c). +An investigation into this behaviour revealed the problem in the estimation of the mode covariance matrices $\mat{O}_k = \E[(\ten{X} - \E\ten{X})_{(k)}\t{(\ten{X} - \E\ten{X})_{(k)}}]$. The mode wise reductions provided by TSIR are computed as $\hat{\mat{O}}_k^{-1}\hat{\mat{\Gamma}}_k$ where the poor estimation of $\hat{\mat{O}}_k$ causes the failure of TSIR. The poor estimate of $\mat{O}_k$ is rooted in the high signal to noise ratio in this particular simulation. GMLM does not have degenerate behaviour for high signal to noise ratios but it is less robust in low signal to noise ratio setting where TSIR performs better in this specific example. +Simulation 1d), incorporating information about the covariance structure behaves similar to 1b), except that GMLM gains a statistically significant lead in estimation performance. The last simulation, 1e), where the model was misspecified for GMLM. GMLM, TSIR as well as MGCCA are on par where GMLM has a sligh lead in the small sample size setting and MGCCA overtakes in higher sample scenarios. The PCA and HOPCA methods both still outperformed. A wrong assumption about the relation to the response is still better than no relation at all. + + +% \todo{How to describe that? I mean, sure, but what to write?} +% The results of 1c) are surprising. The GMLM model behaves as expected, clearly being the best. The first surprise is that PCA, HOPCA and MGCCA are visually indistinguishable. This is explained by a high signal to noise ration in this particular example. But the biggest surprise is the failure of TSIR. It turns out that TSIR is usually well equipped to handle those specific low rank problems (given the true rank of the problem which is the case for all methods in every simulation), but by pure coincidence we picked a case where TSIR fails. Intending to pinpoint the specific problem we made another simulation where we change the tensor order $r$ from $2$ till $4$. Furthermore, we altered the coefficient $\rho$ for the auto regression type matrices $(\mat{\Omega}_k)_{i j} = \rho^{|i - j|}$. We let $\ten{F}_y$ be the $r$ times iterated outer product of the vector $(1, y)$. In the case of $r = 3$ this given the same $\ten{F}_y$ as in 1c). Then, we setup two scenarios where the definition of the true $\mat{\beta}_k$'s of rank $1$, for $k = 1, \ldots, r$, are different. The rest is identical to simulation 1c). +% \begin{itemize} +% \item[V1)] The first version sets all $\mat{\beta}_k$'s identical to +% \begin{displaymath} +% \mat{\beta}_k = \begin{pmatrix} 1 & 1 \\ 0 & 0 \end{pmatrix} +% \end{displaymath} +% which gives the true vectorized reduction matrix $\mat{B} = \bigkron_{k = r}^{1}\mat{\beta}_k$ equal to a $2^r\times 2^r$ rank $1$ matrix with the first row all ones and the rest filled with zeros. 
The minimal true reduction is the $2^r$ dimensional first unit vector $\mat{e}_1$. In this setting the vectorized expected value is given by $\E[\vec{\ten{X}} \mid Y = y] = (1 + y)^r \bigkron_{k = r}^{1}\mat{\Omega}_k^{-1}$ +% +% \begin{displaymath} +% \mat{\Omega}_k = \begin{pmatrix} +% 1 & \rho \\ \rho & 1 +% \end{pmatrix} +% \end{displaymath} +% \begin{displaymath} +% \mat{\Sigma}_k = \mat{\Omega}_k^{-1} = \frac{1}{1 - \rho^2}\begin{pmatrix} +% 1 & -\rho \\ -\rho & 1 +% \end{pmatrix} +% \end{displaymath} +% +% \begin{displaymath} +% \E[\ten{X} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigouter_{k = 1}^{r}\begin{pmatrix} +% 1 \\ -\rho +% \end{pmatrix} +% \end{displaymath} +% \begin{displaymath} +% \E[\vec{\ten{X}} \mid Y = y] = \frac{(1 + y)^r}{(1 - \rho^2)^r}\bigkron_{k = r}^{1}\begin{pmatrix} +% 1 \\ -\rho +% \end{pmatrix} +% \end{displaymath} +% +% In this setting only the first component of $\ten{X}$, that is $(\vec{\ten{X}})_1$, depends directly on $Y$ via $\E[(\vec{\ten{X}})_1\mid Y = y] = (1 + y)^r$. All other components contain information about $Y$ through the correlation structure only. \todo{check this!} +% \item[V2)] Similar to the $\mat{\beta}_k$'s in 1c), we set all $\mat{\beta}_k$'s identical to +% \begin{displaymath} +% \mat{\beta}_k = \begin{pmatrix} 1 & -1 \\ -1 & 1 \end{pmatrix} +% \end{displaymath} +% +% \begin{displaymath} +% \E[\vec{\ten{X}}\mid Y = y] = \frac{(1 - y)^r}{(1 - \rho)^r}\bigkron_{k = r}^{1}\begin{pmatrix} +% 1 \\ -1 +% \end{pmatrix} +% \end{displaymath} +%\end{itemize} + +%\begin{figure}[ht!] +% \centering +% \includegraphics[width = \textwidth]{plots/sim-tsir.pdf} +% \caption{\label{fig:sim-tsir}Simulation to investiage the unexpected failure of TSIR in simulation 1c.} +%\end{figure} @@ -1015,30 +978,74 @@ The results of 1c) are surprising. The GMLM model behaves as expected, clearly b % The setup which is ídentical in both cases is as follows. The response $Y$ is i.i.d. standard normal and the response tensor $\ten{F}_y$ consists of monomials with max order $r$. Its elements are equal to $(\ten{F}_y)_{\mat{i}} = y^{|\mat{i}| - r}$ where $\mat{i}$ is a multi-index of length $r$ and $|mat{i}|$ is the sum of th elements of $\mat{i}$. -\begin{figure}[hp!] - \centering - \includegraphics[width = \textwidth]{plots/sim-normal.pdf} - \caption{\label{fig:sim-normal}Visualization of the simulation results for the tensor normal GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-tensor-normal}.} -\end{figure} -\begin{figure}[ht!] - \centering - \includegraphics[width = \textwidth]{plots/sim-tsir.pdf} - \caption{\label{fig:sim-tsir}Simulation to investiage the unexpected failure of TSIR in simulation 1c.} -\end{figure} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Ising Model} +\subsection{Ising Model}\label{sec:sim-ising} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +Assuming for $\ten{X}$ being a $2\times 3$ dimensional binary matrix with conditional matrix (tensor) Ising distribution $\ten{X}\mid Y$ as in \cref{sec:ising_estimation}. We let for $i = 1, \ldots, n$ the response being i.i.d. uniformly distributed in $[-1, 1]$ establishing the conditional value in the i.i.d. samples from $\ten{X}_i\mid Y = y_i$ with GMLM parameters $\mat{\beta}_1, \mat{\beta}_2$, $\mat{\Omega}_1, \mat{\Omega}_2$. 
We let +\begin{displaymath} + \mat{\beta}_1 = \begin{pmatrix} + 1 & 0 \\ 0 & 1 + \end{pmatrix}, \qquad \mat{\beta}_2 = \begin{pmatrix} + 1 & 0 \\ 0 & 1 \\ 0 & 0 + \end{pmatrix} +\end{displaymath} +and +\begin{displaymath} + \mat{\Omega}_1 = \begin{pmatrix} + 0 & -2 \\ -2 & 0 + \end{pmatrix}, \qquad \mat{\Omega}_2 = \begin{pmatrix} + 1 & 0.5 & 0 \\ + 0.5 & 1 & 0.5 \\ + 0 & 0.5 & 1 + \end{pmatrix} +\end{displaymath} +as well as +\begin{displaymath} + \ten{F}_y = \begin{pmatrix} + \sin(\pi y) & -\cos(\pi y) \\ + \cos(\pi y) & \sin(\pi y) + \end{pmatrix} +\end{displaymath} +if not mentioned otherwise in a specific simulation setup given next. + +\begin{itemize} + \item[2a)] A fully linear relation to the response set to be $\ten{F}_y\equiv y$ being a $1\times 1$ matrix with $\t{\mat{\beta}_1} = (1, 0)$ and $\t{\mat{\beta}_2} = (1, 0, 0)$. + \item[2b)] The ``base'' simulation with all parameters as described above. + \item[2c)] Low rank regression with both $\mat{\beta}_1$ and $\mat{\beta}_2$ of rank $1$ chosen as + \begin{displaymath} + \mat{\beta}_1 = \begin{pmatrix} + 1 & 0 \\ 1 & 0 + \end{pmatrix}, \qquad \mat{\beta}_2 = \begin{pmatrix} + 0 & 0 \\ 1 & -1 \\ 0 & 0 + \end{pmatrix}. + \end{displaymath} + \item[2d)] We conclude with a simulation relating to the original design of the Ising model. It is a mathematical model to study the behaviour of Ferromagnetism \textcite{Ising1925} in a thermodynamic setting modeling the interaction effects of elementary magnets (spin up/down relating to $0$ and $1$). The model assumes all elementary magnets to be the same, which translates to all having the same coupling strength (two-way interactions) governed by a single parameter relating to the temperature of the system. Assuming the magnets to be arranged in a 2D grid (matrix valued $\ten{X}$), their interactions are constraint to direct neighbours. We can model this by choosing the true $\mat{\Omega}_k$'s to be tri-diagonal matrices with zero diagonal entries and all non-zero entries identical. Since this is a 1D matrix manifold, we can enforce the constraint. Setting the true interaction parameters to be + \begin{displaymath} + \mat{\Omega}_1 = \frac{1}{2}\begin{pmatrix} + 0 & 1 \\ 1 & 0 + \end{pmatrix}, \qquad \mat{\Omega}_2 = \begin{pmatrix} + 0 & 1 & 0 \\ + 1 & 0 & 1 \\ + 0 & 1 & 0 + \end{pmatrix} + \end{displaymath} + where $1 / 2$ relates to an arbitrary temperature. The mean effect depending on $\ten{F}_y$ can be interpreted as an external magnetic field. +\end{itemize} \begin{figure}[ht!] \centering \includegraphics[width = \textwidth]{plots/sim-ising.pdf} - \caption{\label{fig:sim-ising}asknclknasknc} + \caption{\label{fig:sim-ising}Visualization of the simulation results for Ising GMLM. Sample size on the $x$-axis and the mean of subspace distance $d(\mat{B}, \hat{\mat{B}})$ over $100$ replications on the $y$-axis. Described in \cref{sec:sim-ising}.} \end{figure} +The simulation results are visualized in \cref{fig:sim-ising}. Regardless of the simulation setting 2a-d), the comparative results are similar. We observe that PCA and HOPCA, both treating the response $\ten{X}$ as continuous, perform poorly. Not much better are LPCA and CLPCA. Similar to PCA and HOPCA they do not consider the relation to the response, but they are specifically created for binary predictors. Next we have MGCCA which is the first method considering the relation to the response $y$, clearly out-performing all the PCA variants. 
Even better is TSIR, regardless of the treatment of the predictors $\ten{X}$ as continuous, achieving very god results. Finally, the Ising GMLM model is the best in all the simulations although TSIR gets very close in some settings. + +% Due to the surprisingly good result of TSIR, we also applied the tensor normal GMLM model to the exact same simulation, simply treating the response $\ten{X}$ as continuous. +% The raw linear reduction estimates of both the Ising GMLM and the tensor normal GMLM are basically indistinguishable, similar to the very similar results of the different PCA variants. The main reason for this specific + % \begin{table} % \begin{tabular}{c | ccc ccc c} % $n$ & GMLM & PCA & HOPCA & LPCA & CLPCA & TSIR & MGCCA \\ @@ -1057,15 +1064,47 @@ The results of 1c) are surprising. The GMLM model behaves as expected, clearly b %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Data Analysis} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -In this section be perform two \todo{realy two!} applications of the GMLM model on real data. First example is the tensor normal model applied to EEG data\todo{do this!}. Next, we perform a prove of concept data analysis example for chess. The main purpose of choosing chess is two fold. First, we can ilustrate an explicit use case for the (till now ignored) linear constraint matrix $\mat{T}_2$. Second, its a personal interest of one of authors.\todo{???} +In this section we perform two applications of the GMLM model on real data. First example is the tensor normal model applied to EEG data. Next, we perform a prove of concept data analysis example for chess. % The main purpose of choosing chess is two fold. First, we can illustrate an explicit use case for the (till now ignored) linear constraint matrix $\mat{T}_2$\todo{check!}. Second, its a personal interest of one of the authors.\todo{???} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{EEG} +The EEG data (\url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}) is a small study of $77$ alcoholic and $45$ control subjects. Each data point corresponding to a subject consists of a $p_1\times p_2 = 256\times 64$ matrix, with each row representing a time point and each column a channel. The measurements were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps sampled at $256$ time points over $1$ second ($256$ Hz). Different stimulus conditions were used, and for each condition, $120$ trials were measured. We used only a single stimulus condition (S1), and for each subject, we took the average of all the trials under that condition. That is, we used $(\ten{X}_i, y_i)$, $i = 1, \ldots, 122$, where $\ten{X}_i$ is a $256\times 64$ matrix, with each entry representing the mean voltage value of subject $i$ at a combination of a time point and a channel, averaged over all trials under the S1 stimulus condition, and $Y$ is a binary outcome variable with $Y_i = 1$ for an alcoholic and $Y_i = 0$ for a control subject. + +For a comparison we reproduced the leave-one-out cross-validation EEG data analysis \textcite[Sec. 7]{PfeifferKaplaBura2021} for the classification task. In this data set, $p= p_1 p_2 = 16384$ is much larger than $n=122$. To deal with this issue, \textcite{PfeifferKaplaBura2021} used two approaches. 
In the first, pre-screening via (2D)$^2$PCA \parencite{ZhangZhou2005} reduced the dimensions to $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$. In the second, simultaneous dimension reductions and variable selection was carried out using the fast POI-C algorithm of \textcite{JungEtAl2019} (due to high computational high burden, only a 10-fold cross-validation was performed for fast POI-C). + +In contrast to \textcite{PfeifferKaplaBura2021}, our GMLM model can be applied directly to the raw data of dimension $(256, 64)$ without pre-screening or variable selection. This was not possible for K-PIR as the time axis alone was in the large $p$ small $n$ regime with the $p_1 = 256 > n = 122$ leading to a singular time axis covariance. The same issue is present in the GMLM model, but the regularization trick used for numerical stability, as described in \cref{sec:tensor_normal_estimation}, resolves this without any change to the estimation procedure. +In general, the sample size does not need to be large for maximum likelihood estimation in the tensor normal model. \todo{In matrix normal models in particular, \textcite{DrtonEtAl2020} proved that very small +sample sizes are sufficient to obtain unique MLEs for Kronecker covariance structures.} + +We use leave-one-out cross-validation to obtain unbiased AUC estimates. Then, we compare the GMLM model to the best performing methods from \textcite{PfeifferKaplaBura2021}, namely K-PIR (ls) and LSIR from \textcite{PfeifferForzaniBura2012} for $(p_1, p_2) = (3, 4)$, $(15, 15)$ and $(20, 30)$. + +In \cref{tab:eeg} we provide the AUC and its standard deviation. For all applied pre-screening dimensions, K-PIR (ls) has an AUC of $78\%$. LSIR performs better at the price of some instability; it peaked at $85\%$ at $(3, 4)$, then dropped down to $81\%$ at $(15, 15)$ and then increased to $83\%$. In contract, our GMLM method peaked at $(3, 4)$ with $85\%$ and stayed stable at $84\%$, even when no pre-processing was applied. In contrast, fast POI-C that carries out simultaneous feature extraction and feature selection resulted in an AUC of $63\%$, clearly outperformed by all other methods. + +\begin{table}[!hpt] + \centering + \begin{tabular}{l l| c c} + $(p_1, p_2)$ & Method & AUC & St. Dev. \\ \hline + $( 3, 4)$ & K-PIR (ls) & $0.78$ & $0.04$ \\ + & LSIR & $0.85$ & $0.04$ \\ + & GMLM & $0.85$ & $0.04$ \\ \hline + $( 15, 15)$ & K-PIR (ls) & $0.78$ & $0.04$ \\ + & LSIR & $0.81$ & $0.04$ \\ + & GMLM & $0.84$ & $0.04$ \\ \hline + $( 20, 30)$ & K-PIR (ls) & $0.78$ & $0.04$ \\ + & LSIR & $0.83$ & $0.04$ \\ + & GMLM & $0.84$ & $0.04$ \\ \hline + $(256, 64)$ & FastPOI-C & $0.63$\makebox[0pt]{\ \ ${}^{*}$} & $0.22$ \\ + & GMLM & $0.84$ & $0.04$ \\ \hline + \multicolumn{4}{l}{${}^{*}$\footnotesize Mean AUC based on 10-fold cross-validation.} + \end{tabular} + \caption{\label{tab:eeg}Mean AUC values and their standard deviation based on leave-one-out cross-validation for the EEG imaging data (77 alcoholic and 45 control subjects)} +\end{table} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Chess}\label{sec:chess} -The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We downloaded November of 2023 consisting of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted scores, are from Stockfish,\footnote{\fullcite{stockfish}} a free and strong chess engine. 
The scores take the role of the response $Y$ and correspond to a winning probability from whites point of few. Positive scores are good for white and negative scores indicate an advantage for black. We ignore all highly unbalanced positions, which we set to be positions with absolute score above $5$. We also remove all positions with a mate score (one side can force check mate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings including the start position which is in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates. +The data set is provided by the \citetitle{lichess-database}\footnote{\fullcite{lichess-database}}. We downloaded November of 2023 consisting of more than $92$ million games. We removed all games without position evaluations. The evaluations, also denoted scores, are from Stockfish\footnote{\fullcite{stockfish}}, a free and strong chess engine. The scores take the role of the response $Y$ and correspond to a winning probability from whites point of few. Positive scores are good for white and negative scores indicate an advantage for black. We ignore all highly unbalanced positions, which we set to be positions with absolute score above $5$. We also remove all positions with a mate score (one side can force check mate). Furthermore, we only consider positions after $10$ half-moves to avoid oversampling the beginning of the most common openings including the start position which is in every game. Finally, we only consider positions with white to move. This leads to a final data set of roughly $64$ million positions, including duplicates. A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{piece}}$ of dimensions $8\times 8$. Every binary matrix encodes the positioning of a particular piece by containing a $1$ if the piece is present at the corresponding board position. The $12$ pieces derive from the $6$ types of pieces, namely pawns (\pawn), knights (\knight), bishops (\bishop), queens (\queen) and kings (\king) of two colors, black and white. See \cref{fig:fen2tensor} for a visualization. @@ -1075,13 +1114,13 @@ A chess position is encoded as a set of $12$ binary matrices $\ten{X}_{\mathrm{p \caption{\label{fig:fen2tensor}The chess start position and its 3D binary tensor representation, empty entries are $0$.} \end{figure} -We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model \cref{sec:ising_estimation} with different conditional piece predictors being independent. The independence assumption is for the sake of simplicity even though this is clearly not the case in the underlying true distribution. See \cref{sec:chess-discussion} for a short discussion of improvements. By this simplifying assumption we get a mixture model with the log-likelihood +We assume that $\ten{X}_{\mathrm{piece}}\mid Y = y$ follows an Ising GMLM model \cref{sec:ising_estimation} with different conditional piece predictors being independent. The independence assumption is for the sake of simplicity even though this is clearly not the case in the underlying true distribution. 
By this simplifying assumption we get a mixture model with the log-likelihood \begin{displaymath} l_n(\mat{\theta}) = \frac{1}{12}\sum_{\mathrm{piece}}l_n(\mat{\theta}_{\mathrm{piece}}) \end{displaymath} where $l_n(\mat{\theta}_{\mathrm{piece}})$ is the Ising GMLM log-likelihood as in \cref{sec:ising_estimation} for $\ten{X}_{\mathrm{piece}}\mid Y = y$. For every component the same relation to the scores $y$ is modeled via a $2\times 2$ dimensional matrix valued function $\ten{F}_y$ consisting of the monomials $1, y, y^2$, specifically $(\ten{F}_y)_{i j} = y^{i + j - 2}$. -By the raw scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option. Due to the high dimension on binary data, which is $12$ times a $8\times 8$ for every observation giving a total dimension of $768$. The main issue is that a manageable subset, say one million observations, still leads to a degenerate data set. In our simplified mixture model, the pawns are a specific issue as there are multiple millions of different combinations of the $8$ pawns per color on the $6\times 8$ sub grid the pawns can be positioned. This allown does not allow to take a reasonable sized subset for estimation. The solution is to switch from a classic gradient based optimization to a stochastic version. This means that every gradient update uses a new random subset of the entire data set. Therefore, we draw independent random samples form the data consisting of $64$ million positions. The independence of samples derived from the independence of games, and every sample is drawn from a different game. +By the raw scale of the data, millions of observations, it is computationally infeasible to compute the gradients on the entire data set. Simply using a computationally manageable subset is not an option. Due to the high dimension on binary data, which is $12$ times a $8\times 8$ for every observation giving a total dimension of $768$. The main issue is that a manageable subset, say one million observations, still leads to a degenerate data set. In our simplified mixture model, the pawns are a specific issue as there are multiple millions of different combinations of the $8$ pawns per color on the $6\times 8$ sub grid the pawns can be positioned. This alone does not allow to take a reasonable sized subset for estimation. The solution is to switch from a classic gradient based optimization to a stochastic version. This means that every gradient update uses a new random subset of the entire data set. Therefore, we draw independent random samples form the data consisting of $64$ million positions. The independence of samples derived from the independence of games, and every sample is drawn from a different game. \paragraph{Validation:} Given the non-linear nature of the reduction, due to the quadratic matrix valued function $\ten{F}_y$ of the score $y$, we use a \emph{generalized additive model}\footnote{using the function \texttt{gam()} from the \texttt{R} package \texttt{mgcv}.} (GAM) to predict position scores from reduced positions. The reduced positions are $48$ dimensional continuous values by combining the $12$ mixture components from the $2\times 2$ matrix valued reductions per piece. 
The per piece reduction is @@ -1094,7 +1133,7 @@ which gives the complete $48$ dimensional vectorized reduction by stacking the p = (\vec{\ten{R}(\ten{X}_{\text{white pawn}})}, \ldots, \vec{\ten{R}(\ten{X}_{\text{black king}})}) = \t{\mat{B}}\vec(\ten{X} - \E\ten{X}). \end{displaymath} -The second line encodes all the piece wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{gam}\approx 42\%$. A linear model on the reduced data achieves $R^2_{lm}\approx 25\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\todo{ref/mention/?!?} engine, given the full position (\emph{not} reduced), achieves an $R^2_{hce}\approx 51\%$. The $42\%$ are reasonably well comparied to $51\%$ of the engine static evaluation which gets the original position and uses chess specific expect knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are; \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure} or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed. +The second line encodes all the piece wise reductions in a block diagonal full reduction matrix $\mat{B}$ of dimension $768\times 48$ which is applied to the vectorized 3D tensor $\ten{X}$ combining all the piece components $\ten{X}_{\mathrm{piece}}$ into a single tensor of dimension $8\times 8\times 12$. This is a reduction to $6.25\%$ of the original dimension. The $R^2$ statistic of the GAM fitted on $10^5$ new reduced samples is $R^2_{\mathrm{gam}}\approx 46\%$. A linear model on the reduced data achieves $R^2_{\mathrm{lm}}\approx 26\%$ which clearly shows the non-linear relation. On the other hand, the static evaluation of the \emph{Schach H\"ornchen}\footnote{Main authors personal chess engine, used code from the engine is provided in the supplementary material as the \texttt{Rchess} R-package in the \todo{supplementary material}.} engine, given the full position (\emph{not} reduced), achieves an $R^2_{\mathrm{hce}}\approx 52\%$. The $42\%$ are reasonably well compared to $51\%$ of the engine static evaluation which gets the original position and uses chess specific expect knowledge. Features the static evaluation includes, which are expected to be learned by the GMLM mixture model, are; \emph{material} (piece values) and \emph{piece square tables} (PSQT, preferred piece type positions). In addition, the static evaluation includes chess specific features like \emph{king safety}, \emph{pawn structure} or \emph{rooks on open files}. This lets us conclude that the reduction captures most of the relevant features possible, given the oversimplified modeling we performed. \paragraph{Interpretation:} For a compact interpretation of the estimated reduction we construct PSQTs. To do so we use the linear model from the validation section. Then, we rewrite the combined linear reduction and linear model in terms of PSQTs. 
Let $\mat{B}$ be the $768\times 48$ full vectorized linear reduction. This is the block diagonal matrix with the $64\times 4$ dimensional per piece reductions $\mat{B}_{\mathrm{piece}} = \mat{\beta}^{\mathrm{piece}}_2\otimes\mat{\beta}^{\mathrm{piece}}_1$. Then, the linear model with coefficients $\mat{b}$ and intercept $a$ on the reduced data is given by \begin{equation}\label{eq:chess-lm} @@ -1223,10 +1262,10 @@ as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matr \begin{displaymath} \mat{S}_{(p_1, p_2), (q_1, q_2)} = \mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1} \end{displaymath} - where $\mat{K}_{p, q}$ is the \emph{commutation matrix} from \textcite[Ch.~11]{MatrixAlgebra-AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$. + where $\mat{K}_{p, q}$ is the \emph{commutation matrix} from \textcite[Ch.~11]{AbadirMagnus2005}, that is the permutation such that $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ for every $p\times q$ dimensional matrix $\mat{A}$. \end{lemma} \begin{proof} - \textcite[Lemma~7]{SymMatandJacobians-MagnusNeudecker1986} states that + \textcite[Lemma~7]{MagnusNeudecker1986} states that \begin{align} \vec(\mat{A}_2\otimes\mat{A}_1) &= (\mat{I}_{q_2}\otimes\mat{K}_{q_1, p_2}\otimes\mat{I}_{p_1})(\vec{\mat{A}_2}\otimes\vec{\mat{A}_1}) \label{eq:MagnusNeudecker1986-vec-kron-identity} \\ @@ -1258,7 +1297,7 @@ as well as for any tensor $\ten{A}$ of even order $2 r$ and matching square matr \section{Proofs}\label{app:B} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{proof}[Proof of \cref{thm:sdr}]\label{proof:sdr} - A direct implication of \textcite[Theorem~1]{sdr-BuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quadratic-exp-fam} with natural statistic \eqref{eq:t-stat}, + A direct implication of \textcite[Theorem~1]{BuraDuarteForzani2016} is that, under the exponential family \eqref{eq:quadratic-exp-fam} with natural statistic \eqref{eq:t-stat}, \begin{displaymath} \t{\mat{\alpha}}(\mat{t}(\ten{X}) - \E\mat{t}(\ten{X})) \end{displaymath} @@ -1292,7 +1331,7 @@ which obtains that \t{\mat{B}}\vec(\ten{X} - \E\ten{X}) = \vec\Bigl(\ten{F}_y\mlm_{k = 1}^{r}\mat{\beta}_k\Bigr) \end{displaymath} -is also a sufficient reduction, though not necessarily minimal, using $\mat{B} = \bigkron_{k = 1}^{r}\mat{\beta}_k$. When the exponential family is full rank, which in our setting amounts to all $\mat{\beta}_j$ being full rank matrices, $j=1,\ldots,r$, then \textcite[Thm~1]{sdr-BuraDuarteForzani2016} also obtains the minimality of the reduction. +is also a sufficient reduction, though not necessarily minimal, using $\mat{B} = \bigkron_{k = 1}^{r}\mat{\beta}_k$. When the exponential family is full rank, which in our setting amounts to all $\mat{\beta}_j$ being full rank matrices, $j=1,\ldots,r$, then \textcite[Thm~1]{BuraDuarteForzani2016} also obtains the minimality of the reduction. \end{proof} \todo{check proof of Thm 2} @@ -1313,8 +1352,7 @@ is also a sufficient reduction, though not necessarily minimal, using $\mat{ = \mat{T}_2\pinv{\mat{D}_p}\vec\ten{g}_2(\mat{\eta}_{y_i}). \end{align*} The gradients are related to their derivatives by transposition, $\nabla_{\mat{\eta}_{1{y_i}}}b = \t{\D b(\mat{\eta}_{1{y_i}})}$ and $\nabla_{\mat{\eta}_2}b = \t{\D b(\mat{\eta}_2)}$. 
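As an aside, the vec-Kronecker identity \eqref{eq:MagnusNeudecker1986-vec-kron-identity} and the commutation matrix convention $\vec{\t{\mat{A}}} = \mat{K}_{p, q}\vec{\mat{A}}$ used in the lemma above can be checked numerically; the following \texttt{R} snippet is purely illustrative and not part of the supplementary code.
\begin{verbatim}
# commutation matrix K(p, q): vec(t(A)) = K(p, q) %*% vec(A) for A of
# dimension p x q
K <- function(p, q) {
  M <- matrix(0, p * q, p * q)
  for (i in seq_len(p)) for (j in seq_len(q))
    M[(i - 1) * q + j, (j - 1) * p + i] <- 1
  M
}

p1 <- 2; q1 <- 3; p2 <- 4; q2 <- 2
A1 <- matrix(rnorm(p1 * q1), p1, q1)
A2 <- matrix(rnorm(p2 * q2), p2, q2)

# defining property of the commutation matrix
all.equal(c(t(A1)), drop(K(p1, q1) %*% c(A1)))            # TRUE

# vec(A2 kron A1) = (I_{q2} kron K_{q1,p2} kron I_{p1})(vec A2 kron vec A1)
S <- diag(q2) %x% K(q1, p2) %x% diag(p1)
all.equal(c(A2 %x% A1), drop(S %*% (c(A2) %x% c(A1))))    # TRUE
\end{verbatim}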
- Next we provide the differentials of the natural parameter components from \eqref{eq:eta1} and \eqref{eq:eta2} in a quite direct form ,without any further ``simplifications'', because the down-stream computations won't benefit from reexpressing the following - \todo{in terms of $m_{\mat{\theta}}$, meaning without the sum, makes it a bit nicer!} + Next we provide the differentials of the natural parameter components from \eqref{eq:eta1} and \eqref{eq:eta2} in a fairly direct form, without further ``simplifications'', because the downstream computations do not benefit from re-expressing the following \begin{align*} \d\mat{\eta}_{1{y_i}}(\overline{\ten{\eta}}) &= \d\vec{\overline{\ten{\eta}}}, \\ @@ -1359,7 +1397,7 @@ is also a sufficient reduction, though not necessarily minimal, using $\mat{ = \mat{D}_p\pinv{\mat{D}_p}\vec{\ten{g}_2(\mat{\eta}_y)} = \vec{\ten{g}_2(\mat{\eta}_y)} \end{displaymath} - where the last equality holds because $\mat{N}_p = \mat{D}_p\pinv{\mat{D}_p}$ is the symmetrizer matrix from \textcite[Ch. 11]{MatrixAlgebra-AbadirMagnus2005}. For the symmetrizer matrix $\mat{N}_p$ holds $\mat{N}_p\vec{\mat{A}} = \vec{\mat{A}}$ if $\mat{A} = \t{\mat{A}}$, while + where the last equality holds because $\mat{N}_p = \mat{D}_p\pinv{\mat{D}_p}$ is the symmetrizer matrix from \textcite[Ch.~11]{AbadirMagnus2005}. The symmetrizer matrix $\mat{N}_p$ satisfies $\mat{N}_p\vec{\mat{A}} = \vec{\mat{A}}$ whenever $\mat{A} = \t{\mat{A}}$, while \begin{displaymath} \vec{\ten{g}_2(\mat{\eta}_y)} = \vec\E[\ten{X}\circ\ten{X}\mid Y = y] = \vec\E[(\vec{\ten{X}})\t{(\vec{\ten{X}})}\mid Y = y] \end{displaymath} @@ -1404,7 +1442,7 @@ is also a sufficient reduction, though not necessarily minimal, using $\mat{ = \{ \mat{\Omega}\in\manifold{K}_{\mat{\Omega}} : (\mat{I}_{p^2} - \mat{P})\vec{\mat{\Omega}} = \mat{0} \}. \end{displaymath} - To show that $\manifold{CK}_{\mat{\Omega}}$ is an embedded submanifold of $\manifold{K}_{\mat{\Omega}}$ we apply the ``Constant-Rank Level Set Theorem'' \textcite[Thm~5.12]{introToSmoothMani-Lee2012} which states (slightly adapted) the following; + To show that $\manifold{CK}_{\mat{\Omega}}$ is an embedded submanifold of $\manifold{K}_{\mat{\Omega}}$ we apply the ``Constant-Rank Level Set Theorem'' \textcite[Thm~5.12]{Lee2012}, which states (slightly adapted) the following: Let $\manifold{A}$, $\manifold{B}$ be smooth manifolds and $F:\manifold{A}\to\manifold{B}$ a smooth map such that $\nabla_{\mat{a}} F$ has constant matrix rank for all $\mat{a}\in\manifold{A}$. Then, for every $\mat{b}\in F(\mat{A})\subseteq\manifold{B}$, the preimage $F^{-1}(\{ \mat{b} \})$ is a smooth embedded submanifold of $\manifold{A}$. In our setting, we have $F:\manifold{K}_{\mat{\Omega}}\to\mathbb{R}^{p^2}$ defined as $F(\mat{\Omega}) = (\mat{I}_{p^2} - \mat{P})\vec{\mat{\Omega}}$ with gradient $\nabla_{\mat{\Omega}} F = \mat{I}_{p^2} - \mat{P}$ of constant rank. Therefore, $F^{-1}(\{\mat{0}\}) = \manifold{CK}_{\mat{\Omega}}$ is an embedded submanifold of $\manifold{K}_{\mat{\Omega}}$. @@ -1441,14 +1479,14 @@ is also a sufficient reduction, though not necessarily minimal, using $\mat{ \end{proof} \begin{proof}[Proof of \cref{thm:M-estimator-consistency-on-subsets}] - It follows the proof of \textcite[Proposition~2.4]{asymptoticMLE-BuraEtAl2018} with the same assumptions. The only exception is we only require $\Theta$ to be a subset of $\Xi$.
This is accounted for by replacing Lemma~2.3 in \textcite{asymptoticMLE-BuraEtAl2018} with \cref{thm:exists-strong-M-estimator-on-subsets} to obtain the existence of a strong M-estimator on $\Theta$. + The proof follows that of \textcite[Proposition~2.4]{BuraEtAl2018} under the same assumptions. The only exception is that we only require $\Theta$ to be a subset of $\Xi$. This is accounted for by replacing Lemma~2.3 in \textcite{BuraEtAl2018} with \cref{thm:exists-strong-M-estimator-on-subsets} to obtain the existence of a strong M-estimator on $\Theta$. \end{proof} \begin{proof}[Proof of \cref{thm:M-estimator-asym-normal-on-manifolds}] - Let $\varphi:U\to\varphi(U)$ be a coordinate chart\footnote{By \cref{def:manifold}, the chart $\varphi : U\to\varphi(U)$ is bi-continuous, is infinitely often continuously differentiable, and has a continuously differentiable inverse $\varphi^{-1} : \varphi(U) \to U$. Furthermore, the domain $U$ is open according to the trace topology on $\Theta$, that means that their exists an open set $O\subseteq\mathbb{R}^p$ such that $U = \Theta\cap O$.} with $\mat{\theta}_0\in U\subseteq\Theta$. As $\varphi$ is continuous we get with the continuous mapping theorem on metric spaces \textcite[Thm~18.11]{asymStats-van_der_Vaart1998} that $\varphi(\hat{\mat{\theta}}_n)\xrightarrow{p}\varphi(\mat{\theta}_0)$ which implies $P(\varphi(\hat{\mat{\theta}}_n)\in\varphi(U))\xrightarrow{n\to\infty}1$. + Let $\varphi:U\to\varphi(U)$ be a coordinate chart\footnote{By \cref{def:manifold}, the chart $\varphi : U\to\varphi(U)$ is bi-continuous, is infinitely often continuously differentiable, and has a continuously differentiable inverse $\varphi^{-1} : \varphi(U) \to U$. Furthermore, the domain $U$ is open in the trace topology on $\Theta$, that is, there exists an open set $O\subseteq\mathbb{R}^p$ such that $U = \Theta\cap O$.} with $\mat{\theta}_0\in U\subseteq\Theta$. Since $\varphi$ is continuous, the continuous mapping theorem on metric spaces \textcite[Thm~18.11]{vanderVaart1998} yields $\varphi(\hat{\mat{\theta}}_n)\xrightarrow{p}\varphi(\mat{\theta}_0)$, which implies $P(\varphi(\hat{\mat{\theta}}_n)\in\varphi(U))\xrightarrow{n\to\infty}1$. - The next step is to apply \textcite[Thm~5.23]{asymStats-van_der_Vaart1998} to $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$. Therefore, assume that $\hat{\mat{s}}_n\in\varphi(U)$. Denote with $\mat{s} = \varphi(\mat{\theta})\in\varphi(U)\subseteq\mathbb{R}^d$ the coordinates of the parameter $\mat{\theta}\in U\subseteq\Theta$ of the $d = \dim(\Theta)$ dimensional manifold $\Theta\subseteq\mathbb{R}^p$. With $\varphi : U\to\varphi(U)$ being bijective, we can express $m_{\mat{\theta}}$ in terms of $\mat{s} = \varphi(\mat{\theta})$ for every $\mat{\theta}\in U$ as $m_{\mat{\theta}} = m_{\varphi^{-1}(\mat{s})}$. Furthermore, denote + The next step is to apply \textcite[Thm~5.23]{vanderVaart1998} to $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$. Therefore, assume that $\hat{\mat{s}}_n\in\varphi(U)$. Denote by $\mat{s} = \varphi(\mat{\theta})\in\varphi(U)\subseteq\mathbb{R}^d$ the coordinates of the parameter $\mat{\theta}\in U\subseteq\Theta$ of the $d = \dim(\Theta)$ dimensional manifold $\Theta\subseteq\mathbb{R}^p$. Since $\varphi : U\to\varphi(U)$ is bijective, we can express $m_{\mat{\theta}}$ in terms of $\mat{s} = \varphi(\mat{\theta})$ for every $\mat{\theta}\in U$ as $m_{\mat{\theta}} = m_{\varphi^{-1}(\mat{s})}$.
Furthermore, denote \begin{displaymath} M(\mat{\theta}) = \E[m_{\mat{\theta}}(Z)] \qquad\text{and}\qquad M_{\varphi}(\mat{s}) = \E[m_{\varphi^{-1}(\mat{s})}(Z)] = M(\varphi^{-1}(\mat{s})). \end{displaymath} @@ -1482,7 +1520,7 @@ is also a sufficient reduction, though not necessarily minimal, using $\mat{ \overset{(b)}{\leq} u(z) \sup_{\mat{s}\in \overline{V}_{\epsilon / 2}(\mat{s}_0)}\|\nabla\varphi^{-1}(\mat{s})\| \|\mat{s}_1 - \mat{s}_2\| =: v(z) \|\mat{s}_1 - \mat{s}_2\|. \end{multline*} - Here, $(a)$ holds by assumption and $(b)$ is a result of the mean value theorem. Now, $v(z)$ is measurable and square integrable as a scaled version of $u(z)$. Finally, with $\varphi$ being one-to-one, we get that $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$ is a strong M-estimator for $\mat{s}_0 = \varphi(\mat{\theta}_0)$ of the objective $M_{\varphi}$. Now, we apply \textcite[Thm~5.23]{asymStats-van_der_Vaart1998} to get the asymptotic normality of $\hat{\mat{s}}_n$ as + Here, $(a)$ holds by assumption and $(b)$ follows from the mean value theorem. Now, $v(z)$ is measurable and square integrable as a scaled version of $u(z)$. Finally, since $\varphi$ is one-to-one, $\hat{\mat{s}}_n = \varphi(\hat{\mat{\theta}}_n)$ is a strong M-estimator for $\mat{s}_0 = \varphi(\mat{\theta}_0)$ of the objective $M_{\varphi}$. Now, we apply \textcite[Thm~5.23]{vanderVaart1998} to get the asymptotic normality of $\hat{\mat{s}}_n$ as \begin{displaymath} \sqrt{n}(\hat{\mat{s}}_n - \mat{s}_0) \xrightarrow{d} \mathcal{N}_{d}(0, \mat{\Sigma}_{\mat{s}_0}) \end{displaymath} @@ -1590,7 +1628,7 @@ where $Z_i = (\ten{X}_i, Y_i)$. Using \eqref{eq:eta-to-xi-linear-relation} we ca The following are the regularity conditions for the log-likelihood required by \cref{thm:asymptotic-normality-gmlm}. \begin{condition}\label{cond:differentiable-and-convex} - The mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is twice continuously differentiable for almost every $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable. Moreover, $\mat{\eta}\mapsto b(\mat{\eta})$ is strictly convex. \todo{Furthermore, for every $\widetilde{\mat{\eta}}$ holds $P(\mat{F}(Y)\mat{\xi} = \widetilde{\mat{\eta}}) < 1$. Do I need this???} + The mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z)$ is twice continuously differentiable for almost every $z$ and $z\mapsto m_{\mat{\xi}}(z)$ is measurable. Moreover, $\mat{\eta}\mapsto b(\mat{\eta})$ is strictly convex. Furthermore, $P(\mat{F}(Y)\mat{\xi} = \widetilde{\mat{\eta}}) < 1$ holds for every $\widetilde{\mat{\eta}}$. \end{condition} \begin{condition}\label{cond:moments} @@ -1667,7 +1705,7 @@ The following is a technical Lemma used in the proof of \cref{thm:asymptotic-nor \begin{proof}[Proof of \cref{thm:asymptotic-normality-gmlm}] The proof consists of three parts. First, we show the existence of a consistent strong M-estimator by applying \cref{thm:M-estimator-consistency-on-subsets}. Next, we apply \cref{thm:M-estimator-asym-normal-on-manifolds} to obtain its asymptotic normality. We conclude by computing the missing parts of the asymptotic covariance matrix $\mat{\Sigma}_{\mat{\theta}_0}$ provided by \cref{thm:M-estimator-asym-normal-on-manifolds}. - We check whether the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied.
On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \eqref{eq:quadratic-exp-fam}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which obtains that the true $\mat{\theta}_0$ is a unique maximizer of $\mat{\theta}_0\in\Theta\subseteq\Xi$ by applying \textcite[Lemma~5.35]{asymStats-van_der_Vaart1998}. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled yielding the existence of an consistent strong M-estimator over $\Theta\subseteq\Xi$. + We check whether the conditions of \cref{thm:M-estimator-consistency-on-subsets} are satisfied. On $\Xi$, the mapping $\mat{\xi}\mapsto m_{\mat{\xi}}(z) = m_{\mat{\xi}}(\ten{X},y) = \langle \mat{F}(y)\mat{\xi}, \mat{t}(\ten{X}) \rangle - b(\mat{F}(y)\mat{\xi})$ is strictly concave for every $z$ because $\mat{\xi}\mapsto\mat{F}(y)\mat{\xi}$ is linear and $b$ is strictly convex by \cref{cond:differentiable-and-convex}. Since $\ten{X} \mid Y$ is distributed according to \eqref{eq:quadratic-exp-fam}, the function $M(\mat{\xi}) = \E m_{\mat{\xi}}(Z)$ is well defined by \cref{cond:moments}. Let $\mat{\xi}_k = (\vec{\overline{\ten{\eta}}_k}, \vec{\mat{B}_k}, \vech{\mat{\Omega}_k})$, and $f_{\mat{\xi}_k}$ be the pdf of $\ten{X} \mid Y$ indexed by $\mat{\xi}_k$, for $k = 1, 2$. If $\mat{\xi}_1\ne \mat{\xi}_2$, then $f_{\mat{\xi}_1} \neq f_{\mat{\xi}_2}$, which, by \textcite[Lemma~5.35]{vanderVaart1998}, implies that the true $\mat{\theta}_0\in\Theta\subseteq\Xi$ is the unique maximizer of $M$. Finally, under \cref{cond:finite-sup-on-compacta}, all assumptions of \cref{thm:M-estimator-consistency-on-subsets} are fulfilled, yielding the existence of a consistent strong M-estimator over $\Theta\subseteq\Xi$. Next, let $\hat{\mat{\theta}}_n$ be a strong M-estimator on $\Theta\subseteq\Xi$, whose existence and consistency were shown in the previous step. Since $z\mapsto m_{\mat{\xi}}(z)$ is measurable for all $\mat{\xi}\in\Xi$, it is also measurable in a neighborhood of $\mat{\theta}_0$. The differentiability of $\mat{\theta}\mapsto m_{\mat{\theta}}(z)$ is stated in \cref{cond:differentiable-and-convex}. For the Lipschitz condition, let $K\subseteq\Xi$ be a compact neighborhood of $\mat{\theta}_0$, which exists since $\Xi$ is open.
Then, \begin{align*} @@ -1711,4 +1749,117 @@ The following is a technical Lemma used in the proof of \cref{thm:asymptotic-nor \printbibliography[heading=bibintoc, title={References}] %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\appendix +\section{Examples}\label{app:examples} + +\begin{example}[Vectorization]\label{ex:vectorization} + Given a matrix + \begin{displaymath} + \mat{A} = \begin{pmatrix} + 1 & 4 & 7 \\ + 2 & 5 & 8 \\ + 3 & 6 & 9 + \end{pmatrix} + \end{displaymath} + its vectorization is $\vec{\mat{A}} = \t{(1, 2, 3, 4, 5, 6, 7, 8, 9)}$ and its half vectorization is $\vech{\mat{A}} = \t{(1, 2, 3, 5, 6, 9)}$. Let $\ten{A}$ be a tensor of dimension $3\times 3\times 3$ given by + \begin{displaymath} + \ten{A}_{:,:,1} = \begin{pmatrix} + 1 & 4 & 7 \\ + 2 & 5 & 8 \\ + 3 & 6 & 9 + \end{pmatrix}, + \qquad + \ten{A}_{:,:,2} = \begin{pmatrix} + 10 & 13 & 16 \\ + 11 & 14 & 17 \\ + 12 & 15 & 18 + \end{pmatrix}, + \qquad + \ten{A}_{:,:,3} = \begin{pmatrix} + 19 & 22 & 25 \\ + 20 & 23 & 26 \\ + 21 & 24 & 27 + \end{pmatrix}. + \end{displaymath} + Then the vectorization of $\ten{A}$ is given by + \begin{displaymath} + \vec{\ten{A}} = \t{(1, 2, 3, 4, \ldots, 26, 27)}\in\mathbb{R}^{27} + \end{displaymath} + while the half vectorization is + \begin{displaymath} + \vech{\ten{A}} = \t{(1, 2, 3, 5, 6, 9, 11, 12, 15, 21)}\in\mathbb{R}^{10}. + \end{displaymath} +\end{example} + +\begin{example}[Matricization] + Let $\ten{A}$ be the $3\times 4\times 2$ tensor given by + \begin{displaymath} + \ten{A}_{:,:,1} = \begin{pmatrix} + 1 & 4 & 7 & 10 \\ + 2 & 5 & 8 & 11 \\ + 3 & 6 & 9 & 12 + \end{pmatrix}, + \ten{A}_{:,:,2} = \begin{pmatrix} + 13 & 16 & 19 & 22 \\ + 14 & 17 & 20 & 23 \\ + 15 & 18 & 21 & 24 + \end{pmatrix}. + \end{displaymath} + Its matricizations are then + \begin{gather*} + \ten{A}_{(1)} = \begin{pmatrix} + 1 & 4 & 7 & 10 & 13 & 16 & 19 & 22 \\ + 2 & 5 & 8 & 11 & 14 & 17 & 20 & 23 \\ + 3 & 6 & 9 & 12 & 15 & 18 & 21 & 24 + \end{pmatrix}, + \qquad + \ten{A}_{(2)} = \begin{pmatrix} + 1 & 2 & 3 & 13 & 14 & 15 \\ + 4 & 5 & 6 & 16 & 17 & 18 \\ + 7 & 8 & 9 & 19 & 20 & 21 \\ + 10 & 11 & 12 & 22 & 23 & 24 + \end{pmatrix}, \\ + \ten{A}_{(3)} = \begin{pmatrix} + 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 \\ + 13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24 + \end{pmatrix}. + \end{gather*} +\end{example} + +% \begin{example}[Symmetrization] +% Let $\ten{A}$ be the $3\times 3\times 3$ tensor from \cref{ex:vectorization}, then the symmetrization of $\ten{A}$ is +% \begin{align*} +% (\sym{\ten{A}})_{:,:,1} &= \frac{1}{6}\begin{pmatrix} +% 6 & 32 & 58 \\ +% 32 & 58 & 84 \\ +% 58 & 84 & 110 +% \end{pmatrix}, \\ +% (\sym{\ten{A}})_{:,:,2} &= \frac{1}{6}\begin{pmatrix} +% 32 & 58 & 84 \\ +% 58 & 84 & 110 \\ +% 84 & 110 & 136 +% \end{pmatrix}, \\ +% (\sym{\ten{A}})_{:,:,3} &= \frac{1}{6}\begin{pmatrix} +% 58 & 84 & 110 \\ +% 84 & 110 & 136 \\ +% 110 & 136 & 162 +% \end{pmatrix}. +% \end{align*} +% \end{example} + +% \begin{example}[Half Vectorization] +% The half vectorization of a square matrix +% \begin{displaymath} +% \mat{A} = \begin{pmatrix} +% 1 & 4 & 7 \\ +% 2 & 5 & 8 \\ +% 3 & 6 & 9 +% \end{pmatrix} +% \end{displaymath} +% is +% \begin{displaymath} +% \vech{\mat{A}} = (1, 2, 3, 5, 6, 9).
+% \end{displaymath} +% \end{example} + \end{document} diff --git a/LaTeX/plots/sim-ising.tex b/LaTeX/plots/sim-ising.tex index 5876a74..e381dfd 100644 --- a/LaTeX/plots/sim-ising.tex +++ b/LaTeX/plots/sim-ising.tex @@ -1,7 +1,7 @@ %%% R code to generate the input data files from corresponding simulation logs % R> setwd("~/Work/tensorPredictors") % R> -% R> for (sim.name in c("2a")) { +% R> for (sim.name in c("2a", "2b", "2c", "2d")) { % R> pattern <- paste0("sim\\_", sim.name, "\\_ising\\-[0-9T]*\\.csv") % R> log.file <- sort( % R> list.files(path = "sim/", pattern = pattern, full.names = TRUE), @@ -66,7 +66,7 @@ \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2a-ising.csv}; \end{axis} \node[anchor = base west, yshift = 0.3em] at (sim-2a.north west) { - a: small + a: linear dependence on $\mathcal{F}_y \equiv y$ }; \begin{axis}[ name=sim-2b, @@ -83,7 +83,7 @@ \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2b-ising.csv}; \end{axis} \node[anchor = base west, yshift = 0.3em] at (sim-2b.north west) { - b: + b: quadratic dependence on $y$ }; \begin{axis}[ name=sim-2c, @@ -99,7 +99,7 @@ \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2c-ising.csv}; \end{axis} \node[anchor = base west, yshift = 0.3em] at (sim-2c.north west) { - c: + c: rank 1 $\boldsymbol{\beta}$'s }; \begin{axis}[ @@ -116,83 +116,21 @@ \addplot[color = clpca] table[x = sample.size, y = dist.subspace.clpca] {aggr-2d-ising.csv}; \end{axis} \node[anchor = base west, yshift = 0.3em] at (sim-2d.north west) { - d: + d: interaction constraints via $\boldsymbol{\Omega}$'s }; - % \begin{axis}[ - % name=sim-1b, - % anchor=north west, at={(sim-2a.right of north east)}, xshift=0.1cm, - % xticklabel=\empty, - % ylabel near ticks, yticklabel pos=right - % ] - % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1b-normal.csv}; - % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1b-normal.csv}; - % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1b-normal.csv}; - % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1b-normal.csv}; - % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1b-normal.csv}; - % \end{axis} - % \node[anchor = base west, yshift = 0.3em] at (sim-1b.north west) { - % b: cubic dependence on $y$ - % }; - - % \begin{axis}[ - % name=sim-1c, - % anchor=north west, at={(sim-2a.below south west)}, yshift=-.8em, - % xticklabel=\empty - % ] - % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1c-normal.csv}; - % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1c-normal.csv}; - % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1c-normal.csv}; - % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1c-normal.csv}; - % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1c-normal.csv}; - % \end{axis} - % \node[anchor = base west, yshift = 0.3em] at (sim-1c.north west) { - % c: rank $1$ $\boldsymbol{\beta}$'s - % }; - - % \begin{axis}[ - % name=sim-1d, - % anchor=north west, at={(sim-1c.right of north east)}, xshift=0.1cm, - % ylabel near ticks, yticklabel pos=right - % ] - % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1d-normal.csv}; - % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] 
{aggr-1d-normal.csv}; - % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1d-normal.csv}; - % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1d-normal.csv}; - % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1d-normal.csv}; - % \end{axis} - % \node[anchor = base west, yshift = 0.3em] at (sim-1d.north west) { - % d: tri-diagonal $\boldsymbol{\Omega}$'s - % }; - - % \begin{axis}[ - % name=sim-1e, - % anchor=north west, at={(sim-1c.below south west)}, yshift=-.8em - % ] - % \addplot[color = pca] table[x = sample.size, y = dist.subspace.pca] {aggr-1e-normal.csv}; - % \addplot[color = hopca] table[x = sample.size, y = dist.subspace.hopca] {aggr-1e-normal.csv}; - % \addplot[color = tsir] table[x = sample.size, y = dist.subspace.tsir] {aggr-1e-normal.csv}; - % \addplot[color = mgcca] table[x = sample.size, y = dist.subspace.mgcca] {aggr-1e-normal.csv}; - % \addplot[color = gmlm, line width=1pt] table[x = sample.size, y = dist.subspace.gmlm] {aggr-1e-normal.csv}; - % \end{axis} - % \node[anchor = base west, yshift = 0.3em] at (sim-1e.north west) { - % e: missspecified - % }; - - - \matrix[anchor = west] at (sim-2a.right of east) { - \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; \\ - \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; \\ - \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; \\ - \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; \\ - \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; \\ - \draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; \\ - \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\ - }; - - \node[anchor = north] at (current bounding box.south) {Sample Size $n$}; + \matrix[anchor = north] at (current bounding box.south) { + \draw[color=gmlm, legend entry, line width=1pt] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {GMLM}; & + \draw[color=tsir, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {TSIR}; & + \draw[color=mgcca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {MGCCA}; & + \draw[color=hopca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {HOPCA}; & + \draw[color=pca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {PCA}; & + \draw[color=lpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {LPCA}; & + \draw[color=clpca, legend entry] plot coordinates {(0, 0) (.3, 0) (.6, 0)}; & \node[anchor=west] {CLPCA}; \\ + }; + \node[anchor = south, rotate = 90] at (current bounding box.west) {Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}; \node[anchor = south, rotate = 270] at (current bounding box.east) {\phantom{Subspace Distance $d(\boldsymbol{B}, \hat{\boldsymbol{B}})$}};
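% Note (illustrative only): the quantity on the vertical axis, the subspace
% distance d(B, Bhat) aggregated in the `dist.subspace.*` columns above, can
% for example be computed as the scaled Frobenius distance of the orthogonal
% projections onto span(B) and span(Bhat); whether this matches the exact
% normalization used for the simulation figures is an assumption here.
% R> dist.subspace <- function(B, Bhat) {
% R>     PB    <- B    %*% solve(crossprod(B),    t(B))    # projection onto span(B)
% R>     PBhat <- Bhat %*% solve(crossprod(Bhat), t(Bhat)) # projection onto span(Bhat)
% R>     norm(PB - PBhat, type = "F") / sqrt(2 * ncol(B))  # scaled to [0, 1]
% R> }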