From 10280b6ea9970fc87c833209f2f74ac7cb9cdc13 Mon Sep 17 00:00:00 2001 From: daniel Date: Thu, 6 Oct 2022 14:25:40 +0200 Subject: [PATCH] add: mvbernoulli, wip: tensorPredictors, add: GMLM --- LICENSE | 626 ---------- LaTeX/GMLM.tex | 504 ++++++++ LaTeX/bernoulli.tex | 656 ++++++++++ LaTeX/main.bib | 128 +- LaTeX/main.tex | 1303 ++++++++++++-------- README.md | 3 - mvbernoulli/DESCRIPTION | 16 + mvbernoulli/NAMESPACE | 7 + mvbernoulli/R/RcppExports.R | 162 +++ mvbernoulli/R/extract.R | 12 + mvbernoulli/inst/examples/ising_grad.R | 363 ++++++ mvbernoulli/inst/examples/ising_sample.R | 90 ++ mvbernoulli/inst/examples/ising_sim.R | 102 ++ mvbernoulli/inst/include/mvbernoulli.h | 86 ++ mvbernoulli/inst/include/threadPool.h | 139 +++ mvbernoulli/src/Makevars | 1 + mvbernoulli/src/RcppExports.cpp | 252 ++++ mvbernoulli/src/bit_utils.cpp | 82 ++ mvbernoulli/src/bit_utils.h | 56 + mvbernoulli/src/int_utils.cpp | 51 + mvbernoulli/src/int_utils.h | 35 + mvbernoulli/src/ising_model.cpp | 812 ++++++++++++ mvbernoulli/src/print.cpp | 53 + mvbernoulli/src/stats.cpp | 115 ++ mvbernoulli/src/types.h | 111 ++ sim/normal.R | 165 +++ simulations/eeg_sim.R | 73 +- simulations/kpir_sim.R | 8 +- tensorPredictors/DESCRIPTION | 1 + tensorPredictors/NAMESPACE | 26 +- tensorPredictors/R/GMLM.R | 316 +++++ tensorPredictors/R/{hoPCA.R => HOPCA.R} | 7 +- tensorPredictors/R/HOPIR.R | 293 +++++ tensorPredictors/R/HOSVD.R | 23 + tensorPredictors/R/ICU.R | 56 + tensorPredictors/R/LSIR.R | 2 +- tensorPredictors/R/NAGD.R | 164 +++ tensorPredictors/R/RMap.R | 21 + tensorPredictors/R/dist_kron_norm.R | 39 + tensorPredictors/R/dist_kron_tr.R | 69 ++ tensorPredictors/R/dist_subspace.R | 6 +- tensorPredictors/R/exprs_all_equal.R | 64 + tensorPredictors/R/kpir_approx.R | 2 +- tensorPredictors/R/kpir_ls.R | 67 +- tensorPredictors/R/kpir_mle.R | 110 +- tensorPredictors/R/kpir_momentum.R | 2 +- tensorPredictors/R/kpir_new.R | 2 +- tensorPredictors/R/matricize.R | 93 +- tensorPredictors/R/mcov.R | 45 + tensorPredictors/R/mcrossprod.R | 23 +- tensorPredictors/R/mkm.R | 31 + tensorPredictors/R/mlm.R | 66 +- tensorPredictors/R/mtvk.R | 63 + tensorPredictors/R/num_deriv.R | 21 + tensorPredictors/R/patternMatrices.R | 139 +++ tensorPredictors/R/rtensornorm.R | 13 +- tensorPredictors/R/ttm.R | 11 +- tensorPredictors/R/vech.R | 33 + tensorPredictors/inst/examples/ICU.R | 31 + tensorPredictors/inst/examples/NAGD.R | 70 ++ tensorPredictors/inst/examples/num_deriv.R | 35 + tensorPredictors/src/init.c | 10 +- tensorPredictors/src/mtvk.c | 101 ++ tensorPredictors/src/ttm.c | 70 +- 64 files changed, 6773 insertions(+), 1363 deletions(-) delete mode 100644 LICENSE create mode 100644 LaTeX/GMLM.tex create mode 100644 LaTeX/bernoulli.tex delete mode 100644 README.md create mode 100644 mvbernoulli/DESCRIPTION create mode 100644 mvbernoulli/NAMESPACE create mode 100644 mvbernoulli/R/RcppExports.R create mode 100644 mvbernoulli/R/extract.R create mode 100644 mvbernoulli/inst/examples/ising_grad.R create mode 100644 mvbernoulli/inst/examples/ising_sample.R create mode 100644 mvbernoulli/inst/examples/ising_sim.R create mode 100644 mvbernoulli/inst/include/mvbernoulli.h create mode 100644 mvbernoulli/inst/include/threadPool.h create mode 100644 mvbernoulli/src/Makevars create mode 100644 mvbernoulli/src/RcppExports.cpp create mode 100644 mvbernoulli/src/bit_utils.cpp create mode 100644 mvbernoulli/src/bit_utils.h create mode 100644 mvbernoulli/src/int_utils.cpp create mode 100644 mvbernoulli/src/int_utils.h create mode 100644 
mvbernoulli/src/ising_model.cpp create mode 100644 mvbernoulli/src/print.cpp create mode 100644 mvbernoulli/src/stats.cpp create mode 100644 mvbernoulli/src/types.h create mode 100644 sim/normal.R create mode 100644 tensorPredictors/R/GMLM.R rename tensorPredictors/R/{hoPCA.R => HOPCA.R} (82%) create mode 100644 tensorPredictors/R/HOPIR.R create mode 100644 tensorPredictors/R/HOSVD.R create mode 100644 tensorPredictors/R/ICU.R create mode 100644 tensorPredictors/R/NAGD.R create mode 100644 tensorPredictors/R/RMap.R create mode 100644 tensorPredictors/R/dist_kron_norm.R create mode 100644 tensorPredictors/R/dist_kron_tr.R create mode 100644 tensorPredictors/R/exprs_all_equal.R create mode 100644 tensorPredictors/R/mcov.R create mode 100644 tensorPredictors/R/mkm.R create mode 100644 tensorPredictors/R/mtvk.R create mode 100644 tensorPredictors/R/num_deriv.R create mode 100644 tensorPredictors/R/patternMatrices.R create mode 100644 tensorPredictors/R/vech.R create mode 100644 tensorPredictors/inst/examples/ICU.R create mode 100644 tensorPredictors/inst/examples/NAGD.R create mode 100644 tensorPredictors/inst/examples/num_deriv.R create mode 100644 tensorPredictors/src/mtvk.c diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8015026..0000000 --- a/LICENSE +++ /dev/null @@ -1,626 +0,0 @@ -GNU GENERAL PUBLIC LICENSE - -Version 3, 29 June 2007 - -Copyright © 2007 Free Software Foundation, Inc. - -Everyone is permitted to copy and distribute verbatim copies of this license -document, but changing it is not allowed. - -Preamble - -The GNU General Public License is a free, copyleft license for software and -other kinds of works. - -The licenses for most software and other practical works are designed to take -away your freedom to share and change the works. By contrast, the GNU General -Public License is intended to guarantee your freedom to share and change all -versions of a program--to make sure it remains free software for all its users. -We, the Free Software Foundation, use the GNU General Public License for most -of our software; it applies also to any other work released this way by its -authors. You can apply it to your programs, too. - -When we speak of free software, we are referring to freedom, not price. Our -General Public Licenses are designed to make sure that you have the freedom -to distribute copies of free software (and charge for them if you wish), that -you receive source code or can get it if you want it, that you can change -the software or use pieces of it in new free programs, and that you know you -can do these things. - -To protect your rights, we need to prevent others from denying you these rights -or asking you to surrender the rights. Therefore, you have certain responsibilities -if you distribute copies of the software, or if you modify it: responsibilities -to respect the freedom of others. - -For example, if you distribute copies of such a program, whether gratis or -for a fee, you must pass on to the recipients the same freedoms that you received. -You must make sure that they, too, receive or can get the source code. And -you must show them these terms so they know their rights. - -Developers that use the GNU GPL protect your rights with two steps: (1) assert -copyright on the software, and (2) offer you this License giving you legal -permission to copy, distribute and/or modify it. - -For the developers' and authors' protection, the GPL clearly explains that -there is no warranty for this free software. 
For both users' and authors' -sake, the GPL requires that modified versions be marked as changed, so that -their problems will not be attributed erroneously to authors of previous versions. - -Some devices are designed to deny users access to install or run modified -versions of the software inside them, although the manufacturer can do so. -This is fundamentally incompatible with the aim of protecting users' freedom -to change the software. The systematic pattern of such abuse occurs in the -area of products for individuals to use, which is precisely where it is most -unacceptable. Therefore, we have designed this version of the GPL to prohibit -the practice for those products. If such problems arise substantially in other -domains, we stand ready to extend this provision to those domains in future -versions of the GPL, as needed to protect the freedom of users. - -Finally, every program is threatened constantly by software patents. States -should not allow patents to restrict development and use of software on general-purpose -computers, but in those that do, we wish to avoid the special danger that -patents applied to a free program could make it effectively proprietary. To -prevent this, the GPL assures that patents cannot be used to render the program -non-free. - -The precise terms and conditions for copying, distribution and modification -follow. - -TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - -"Copyright" also means copyright-like laws that apply to other kinds of works, -such as semiconductor masks. - -"The Program" refers to any copyrightable work licensed under this License. -Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals -or organizations. - -To "modify" a work means to copy from or adapt all or part of the work in -a fashion requiring copyright permission, other than the making of an exact -copy. The resulting work is called a "modified version" of the earlier work -or a work "based on" the earlier work. - -A "covered work" means either the unmodified Program or a work based on the -Program. - -To "propagate" a work means to do anything with it that, without permission, -would make you directly or secondarily liable for infringement under applicable -copyright law, except executing it on a computer or modifying a private copy. -Propagation includes copying, distribution (with or without modification), -making available to the public, and in some countries other activities as -well. - -To "convey" a work means any kind of propagation that enables other parties -to make or receive copies. Mere interaction with a user through a computer -network, with no transfer of a copy, is not conveying. - -An interactive user interface displays "Appropriate Legal Notices" to the -extent that it includes a convenient and prominently visible feature that -(1) displays an appropriate copyright notice, and (2) tells the user that -there is no warranty for the work (except to the extent that warranties are -provided), that licensees may convey the work under this License, and how -to view a copy of this License. If the interface presents a list of user commands -or options, such as a menu, a prominent item in the list meets this criterion. - - 1. Source Code. - -The "source code" for a work means the preferred form of the work for making -modifications to it. "Object code" means any non-source form of a work. 
- -A "Standard Interface" means an interface that either is an official standard -defined by a recognized standards body, or, in the case of interfaces specified -for a particular programming language, one that is widely used among developers -working in that language. - -The "System Libraries" of an executable work include anything, other than -the work as a whole, that (a) is included in the normal form of packaging -a Major Component, but which is not part of that Major Component, and (b) -serves only to enable use of the work with that Major Component, or to implement -a Standard Interface for which an implementation is available to the public -in source code form. A "Major Component", in this context, means a major essential -component (kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to produce -the work, or an object code interpreter used to run it. - -The "Corresponding Source" for a work in object code form means all the source -code needed to generate, install, and (for an executable work) run the object -code and to modify the work, including scripts to control those activities. -However, it does not include the work's System Libraries, or general-purpose -tools or generally available free programs which are used unmodified in performing -those activities but which are not part of the work. For example, Corresponding -Source includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically linked -subprograms that the work is specifically designed to require, such as by -intimate data communication or control flow between those subprograms and -other parts of the work. - -The Corresponding Source need not include anything that users can regenerate -automatically from other parts of the Corresponding Source. - - The Corresponding Source for a work in source code form is that same work. - - 2. Basic Permissions. - -All rights granted under this License are granted for the term of copyright -on the Program, and are irrevocable provided the stated conditions are met. -This License explicitly affirms your unlimited permission to run the unmodified -Program. The output from running a covered work is covered by this License -only if the output, given its content, constitutes a covered work. This License -acknowledges your rights of fair use or other equivalent, as provided by copyright -law. - -You may make, run and propagate covered works that you do not convey, without -conditions so long as your license otherwise remains in force. You may convey -covered works to others for the sole purpose of having them make modifications -exclusively for you, or provide you with facilities for running those works, -provided that you comply with the terms of this License in conveying all material -for which you do not control copyright. Those thus making or running the covered -works for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of your copyrighted -material outside their relationship with you. - -Conveying under any other circumstances is permitted solely under the conditions -stated below. Sublicensing is not allowed; section 10 makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
- -No covered work shall be deemed part of an effective technological measure -under any applicable law fulfilling obligations under article 11 of the WIPO -copyright treaty adopted on 20 December 1996, or similar laws prohibiting -or restricting circumvention of such measures. - -When you convey a covered work, you waive any legal power to forbid circumvention -of technological measures to the extent such circumvention is effected by -exercising rights under this License with respect to the covered work, and -you disclaim any intention to limit operation or modification of the work -as a means of enforcing, against the work's users, your or third parties' -legal rights to forbid circumvention of technological measures. - - 4. Conveying Verbatim Copies. - -You may convey verbatim copies of the Program's source code as you receive -it, in any medium, provided that you conspicuously and appropriately publish -on each copy an appropriate copyright notice; keep intact all notices stating -that this License and any non-permissive terms added in accord with section -7 apply to the code; keep intact all notices of the absence of any warranty; -and give all recipients a copy of this License along with the Program. - -You may charge any price or no price for each copy that you convey, and you -may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - -You may convey a work based on the Program, or the modifications to produce -it from the Program, in the form of source code under the terms of section -4, provided that you also meet all of these conditions: - -a) The work must carry prominent notices stating that you modified it, and -giving a relevant date. - -b) The work must carry prominent notices stating that it is released under -this License and any conditions added under section 7. This requirement modifies -the requirement in section 4 to "keep intact all notices". - -c) You must license the entire work, as a whole, under this License to anyone -who comes into possession of a copy. This License will therefore apply, along -with any applicable section 7 additional terms, to the whole of the work, -and all its parts, regardless of how they are packaged. This License gives -no permission to license the work in any other way, but it does not invalidate -such permission if you have separately received it. - -d) If the work has interactive user interfaces, each must display Appropriate -Legal Notices; however, if the Program has interactive interfaces that do -not display Appropriate Legal Notices, your work need not make them do so. - -A compilation of a covered work with other separate and independent works, -which are not by their nature extensions of the covered work, and which are -not combined with it such as to form a larger program, in or on a volume of -a storage or distribution medium, is called an "aggregate" if the compilation -and its resulting copyright are not used to limit the access or legal rights -of the compilation's users beyond what the individual works permit. Inclusion -of a covered work in an aggregate does not cause this License to apply to -the other parts of the aggregate. - - 6. Conveying Non-Source Forms. 
- -You may convey a covered work in object code form under the terms of sections -4 and 5, provided that you also convey the machine-readable Corresponding -Source under the terms of this License, in one of these ways: - -a) Convey the object code in, or embodied in, a physical product (including -a physical distribution medium), accompanied by the Corresponding Source fixed -on a durable physical medium customarily used for software interchange. - -b) Convey the object code in, or embodied in, a physical product (including -a physical distribution medium), accompanied by a written offer, valid for -at least three years and valid for as long as you offer spare parts or customer -support for that product model, to give anyone who possesses the object code -either (1) a copy of the Corresponding Source for all the software in the -product that is covered by this License, on a durable physical medium customarily -used for software interchange, for a price no more than your reasonable cost -of physically performing this conveying of source, or (2) access to copy the -Corresponding Source from a network server at no charge. - -c) Convey individual copies of the object code with a copy of the written -offer to provide the Corresponding Source. This alternative is allowed only -occasionally and noncommercially, and only if you received the object code -with such an offer, in accord with subsection 6b. - -d) Convey the object code by offering access from a designated place (gratis -or for a charge), and offer equivalent access to the Corresponding Source -in the same way through the same place at no further charge. You need not -require recipients to copy the Corresponding Source along with the object -code. If the place to copy the object code is a network server, the Corresponding -Source may be on a different server (operated by you or a third party) that -supports equivalent copying facilities, provided you maintain clear directions -next to the object code saying where to find the Corresponding Source. Regardless -of what server hosts the Corresponding Source, you remain obligated to ensure -that it is available for as long as needed to satisfy these requirements. - -e) Convey the object code using peer-to-peer transmission, provided you inform -other peers where the object code and Corresponding Source of the work are -being offered to the general public at no charge under subsection 6d. - -A separable portion of the object code, whose source code is excluded from -the Corresponding Source as a System Library, need not be included in conveying -the object code work. - -A "User Product" is either (1) a "consumer product", which means any tangible -personal property which is normally used for personal, family, or household -purposes, or (2) anything designed or sold for incorporation into a dwelling. -In determining whether a product is a consumer product, doubtful cases shall -be resolved in favor of coverage. For a particular product received by a particular -user, "normally used" refers to a typical or common use of that class of product, -regardless of the status of the particular user or of the way in which the -particular user actually uses, or expects or is expected to use, the product. -A product is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent the -only significant mode of use of the product. 
- -"Installation Information" for a User Product means any methods, procedures, -authorization keys, or other information required to install and execute modified -versions of a covered work in that User Product from a modified version of -its Corresponding Source. The information must suffice to ensure that the -continued functioning of the modified object code is in no case prevented -or interfered with solely because modification has been made. - -If you convey an object code work under this section in, or with, or specifically -for use in, a User Product, and the conveying occurs as part of a transaction -in which the right of possession and use of the User Product is transferred -to the recipient in perpetuity or for a fixed term (regardless of how the -transaction is characterized), the Corresponding Source conveyed under this -section must be accompanied by the Installation Information. But this requirement -does not apply if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has been installed -in ROM). - -The requirement to provide Installation Information does not include a requirement -to continue to provide support service, warranty, or updates for a work that -has been modified or installed by the recipient, or for the User Product in -which it has been modified or installed. Access to a network may be denied -when the modification itself materially and adversely affects the operation -of the network or violates the rules and protocols for communication across -the network. - -Corresponding Source conveyed, and Installation Information provided, in accord -with this section must be in a format that is publicly documented (and with -an implementation available to the public in source code form), and must require -no special password or key for unpacking, reading or copying. - - 7. Additional Terms. - -"Additional permissions" are terms that supplement the terms of this License -by making exceptions from one or more of its conditions. Additional permissions -that are applicable to the entire Program shall be treated as though they -were included in this License, to the extent that they are valid under applicable -law. If additional permissions apply only to part of the Program, that part -may be used separately under those permissions, but the entire Program remains -governed by this License without regard to the additional permissions. - -When you convey a copy of a covered work, you may at your option remove any -additional permissions from that copy, or from any part of it. (Additional -permissions may be written to require their own removal in certain cases when -you modify the work.) You may place additional permissions on material, added -by you to a covered work, for which you have or can give appropriate copyright -permission. 
- -Notwithstanding any other provision of this License, for material you add -to a covered work, you may (if authorized by the copyright holders of that -material) supplement the terms of this License with terms: - -a) Disclaiming warranty or limiting liability differently from the terms of -sections 15 and 16 of this License; or - -b) Requiring preservation of specified reasonable legal notices or author -attributions in that material or in the Appropriate Legal Notices displayed -by works containing it; or - -c) Prohibiting misrepresentation of the origin of that material, or requiring -that modified versions of such material be marked in reasonable ways as different -from the original version; or - -d) Limiting the use for publicity purposes of names of licensors or authors -of the material; or - -e) Declining to grant rights under trademark law for use of some trade names, -trademarks, or service marks; or - -f) Requiring indemnification of licensors and authors of that material by -anyone who conveys the material (or modified versions of it) with contractual -assumptions of liability to the recipient, for any liability that these contractual -assumptions directly impose on those licensors and authors. - -All other non-permissive additional terms are considered "further restrictions" -within the meaning of section 10. If the Program as you received it, or any -part of it, contains a notice stating that it is governed by this License -along with a term that is a further restriction, you may remove that term. -If a license document contains a further restriction but permits relicensing -or conveying under this License, you may add to a covered work material governed -by the terms of that license document, provided that the further restriction -does not survive such relicensing or conveying. - -If you add terms to a covered work in accord with this section, you must place, -in the relevant source files, a statement of the additional terms that apply -to those files, or a notice indicating where to find the applicable terms. - -Additional terms, permissive or non-permissive, may be stated in the form -of a separately written license, or stated as exceptions; the above requirements -apply either way. - - 8. Termination. - -You may not propagate or modify a covered work except as expressly provided -under this License. Any attempt otherwise to propagate or modify it is void, -and will automatically terminate your rights under this License (including -any patent licenses granted under the third paragraph of section 11). - -However, if you cease all violation of this License, then your license from -a particular copyright holder is reinstated (a) provisionally, unless and -until the copyright holder explicitly and finally terminates your license, -and (b) permanently, if the copyright holder fails to notify you of the violation -by some reasonable means prior to 60 days after the cessation. - -Moreover, your license from a particular copyright holder is reinstated permanently -if the copyright holder notifies you of the violation by some reasonable means, -this is the first time you have received notice of violation of this License -(for any work) from that copyright holder, and you cure the violation prior -to 30 days after your receipt of the notice. - -Termination of your rights under this section does not terminate the licenses -of parties who have received copies or rights from you under this License. 
-If your rights have been terminated and not permanently reinstated, you do -not qualify to receive new licenses for the same material under section 10. - - 9. Acceptance Not Required for Having Copies. - -You are not required to accept this License in order to receive or run a copy -of the Program. Ancillary propagation of a covered work occurring solely as -a consequence of using peer-to-peer transmission to receive a copy likewise -does not require acceptance. However, nothing other than this License grants -you permission to propagate or modify any covered work. These actions infringe -copyright if you do not accept this License. Therefore, by modifying or propagating -a covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - -Each time you convey a covered work, the recipient automatically receives -a license from the original licensors, to run, modify and propagate that work, -subject to this License. You are not responsible for enforcing compliance -by third parties with this License. - -An "entity transaction" is a transaction transferring control of an organization, -or substantially all assets of one, or subdividing an organization, or merging -organizations. If propagation of a covered work results from an entity transaction, -each party to that transaction who receives a copy of the work also receives -whatever licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the Corresponding -Source of the work from the predecessor in interest, if the predecessor has -it or can get it with reasonable efforts. - -You may not impose any further restrictions on the exercise of the rights -granted or affirmed under this License. For example, you may not impose a -license fee, royalty, or other charge for exercise of rights granted under -this License, and you may not initiate litigation (including a cross-claim -or counterclaim in a lawsuit) alleging that any patent claim is infringed -by making, using, selling, offering for sale, or importing the Program or -any portion of it. - - 11. Patents. - -A "contributor" is a copyright holder who authorizes use under this License -of the Program or a work on which the Program is based. The work thus licensed -is called the contributor's "contributor version". - -A contributor's "essential patent claims" are all patent claims owned or controlled -by the contributor, whether already acquired or hereafter acquired, that would -be infringed by some manner, permitted by this License, of making, using, -or selling its contributor version, but do not include claims that would be -infringed only as a consequence of further modification of the contributor -version. For purposes of this definition, "control" includes the right to -grant patent sublicenses in a manner consistent with the requirements of this -License. - -Each contributor grants you a non-exclusive, worldwide, royalty-free patent -license under the contributor's essential patent claims, to make, use, sell, -offer for sale, import and otherwise run, modify and propagate the contents -of its contributor version. - -In the following three paragraphs, a "patent license" is any express agreement -or commitment, however denominated, not to enforce a patent (such as an express -permission to practice a patent or covenant not to sue for patent infringement). 
-To "grant" such a patent license to a party means to make such an agreement -or commitment not to enforce a patent against the party. - -If you convey a covered work, knowingly relying on a patent license, and the -Corresponding Source of the work is not available for anyone to copy, free -of charge and under the terms of this License, through a publicly available -network server or other readily accessible means, then you must either (1) -cause the Corresponding Source to be so available, or (2) arrange to deprive -yourself of the benefit of the patent license for this particular work, or -(3) arrange, in a manner consistent with the requirements of this License, -to extend the patent license to downstream recipients. "Knowingly relying" -means you have actual knowledge that, but for the patent license, your conveying -the covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that country -that you have reason to believe are valid. - -If, pursuant to or in connection with a single transaction or arrangement, -you convey, or propagate by procuring conveyance of, a covered work, and grant -a patent license to some of the parties receiving the covered work authorizing -them to use, propagate, modify or convey a specific copy of the covered work, -then the patent license you grant is automatically extended to all recipients -of the covered work and works based on it. - -A patent license is "discriminatory" if it does not include within the scope -of its coverage, prohibits the exercise of, or is conditioned on the non-exercise -of one or more of the rights that are specifically granted under this License. -You may not convey a covered work if you are a party to an arrangement with -a third party that is in the business of distributing software, under which -you make payment to the third party based on the extent of your activity of -conveying the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory patent -license (a) in connection with copies of the covered work conveyed by you -(or copies made from those copies), or (b) primarily for and in connection -with specific products or compilations that contain the covered work, unless -you entered into that arrangement, or that patent license was granted, prior -to 28 March 2007. - -Nothing in this License shall be construed as excluding or limiting any implied -license or other defenses to infringement that may otherwise be available -to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - -If conditions are imposed on you (whether by court order, agreement or otherwise) -that contradict the conditions of this License, they do not excuse you from -the conditions of this License. If you cannot convey a covered work so as -to satisfy simultaneously your obligations under this License and any other -pertinent obligations, then as a consequence you may not convey it at all. -For example, if you agree to terms that obligate you to collect a royalty -for further conveying from those to whom you convey the Program, the only -way you could satisfy both those terms and this License would be to refrain -entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. 
- -Notwithstanding any other provision of this License, you have permission to -link or combine any covered work with a work licensed under version 3 of the -GNU Affero General Public License into a single combined work, and to convey -the resulting work. The terms of this License will continue to apply to the -part which is the covered work, but the special requirements of the GNU Affero -General Public License, section 13, concerning interaction through a network -will apply to the combination as such. - - 14. Revised Versions of this License. - -The Free Software Foundation may publish revised and/or new versions of the -GNU General Public License from time to time. Such new versions will be similar -in spirit to the present version, but may differ in detail to address new -problems or concerns. - -Each version is given a distinguishing version number. If the Program specifies -that a certain numbered version of the GNU General Public License "or any -later version" applies to it, you have the option of following the terms and -conditions either of that numbered version or of any later version published -by the Free Software Foundation. If the Program does not specify a version -number of the GNU General Public License, you may choose any version ever -published by the Free Software Foundation. - -If the Program specifies that a proxy can decide which future versions of -the GNU General Public License can be used, that proxy's public statement -of acceptance of a version permanently authorizes you to choose that version -for the Program. - -Later license versions may give you additional or different permissions. However, -no additional obligations are imposed on any author or copyright holder as -a result of your choosing to follow a later version. - - 15. Disclaimer of Warranty. - -THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE -LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER -EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM -PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR -CORRECTION. - - 16. Limitation of Liability. - -IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL -ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM -AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, -INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO -USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED -INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE -PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER -PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - -If the disclaimer of warranty and limitation of liability provided above cannot -be given local legal effect according to their terms, reviewing courts shall -apply local law that most closely approximates an absolute waiver of all civil -liability in connection with the Program, unless a warranty or assumption -of liability accompanies a copy of the Program in return for a fee. 
END OF -TERMS AND CONDITIONS - -How to Apply These Terms to Your New Programs - -If you develop a new program, and you want it to be of the greatest possible -use to the public, the best way to achieve this is to make it free software -which everyone can redistribute and change under these terms. - -To do so, attach the following notices to the program. It is safest to attach -them to the start of each source file to most effectively state the exclusion -of warranty; and each file should have at least the "copyright" line and a -pointer to where the full notice is found. - - - -Copyright (C) - -This program is free software: you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation, either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - -If the program does terminal interaction, make it output a short notice like -this when it starts in an interactive mode: - - Copyright (C) - -This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - -This is free software, and you are welcome to redistribute it under certain -conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands might -be different; for a GUI interface, you would use an "about box". - -You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. For -more information on this, and how to apply and follow the GNU GPL, see . - -The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General Public -License instead of this License. But first, please read . diff --git a/LaTeX/GMLM.tex b/LaTeX/GMLM.tex new file mode 100644 index 0000000..9f6e880 --- /dev/null +++ b/LaTeX/GMLM.tex @@ -0,0 +1,504 @@ +\documentclass[a4paper, 10pt]{article} + +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{fullpage} +\usepackage{amsmath, amssymb, amstext, amsthm} +\usepackage{bm} % \boldsymbol and italic corrections, ... +\usepackage[pdftex]{hyperref} +\usepackage{makeidx} % Index (Symbols, Names, ...) +\usepackage{xcolor, graphicx} % colors and including images +\usepackage{tikz} +\usetikzlibrary{calc} +\usepackage[ + % backend=bibtex, + style=authoryear-comp +]{biblatex} +\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms + +% Document meta into +\title{Generalized Multi-Linear Model for the Quadratic Exponential Family} +\author{Daniel Kapla} +\date{\today} +% Set PDF title, author and creator. 
+\AtBeginDocument{ + \hypersetup{ + pdftitle = {Generalized Multi-Linear Model for the Quadratic Exponential Family}, + pdfauthor = {Daniel Kapla}, + pdfcreator = {\pdftexbanner} + } +} + +\makeindex + +% Bibliography resource(s) +\addbibresource{main.bib} + +% Setup environments +% Theorem, Lemma +\theoremstyle{plain} +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}{Lemma} +\newtheorem{example}{Example} +% Definition +\theoremstyle{definition} +\newtheorem{defn}{Definition} +% Remark +\theoremstyle{remark} +\newtheorem{remark}{Remark} + +% Define math macros +\newcommand{\mat}[1]{\boldsymbol{#1}} +\newcommand{\ten}[1]{\mathcal{#1}} +\renewcommand{\vec}{\operatorname{vec}} +\newcommand{\unvec}{\operatorname{vec^{-1}}} +\newcommand{\reshape}[1]{\operatorname{reshape}_{#1}} +\newcommand{\vech}{\operatorname{vech}} +\newcommand{\rank}{\operatorname{rank}} +\newcommand{\diag}{\operatorname{diag}} +\DeclareMathOperator{\tr}{tr} +\DeclareMathOperator{\var}{Var} +\DeclareMathOperator{\cov}{Cov} +\DeclareMathOperator{\Span}{Span} +\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} +% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} +\DeclareMathOperator*{\argmin}{{arg\,min}} +\DeclareMathOperator*{\argmax}{{arg\,max}} +\newcommand{\D}{\textnormal{D}} % derivative +\renewcommand{\d}{\textnormal{d}} % differential +\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose +\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` +\newcommand{\invlink}{\widetilde{\mat{g}}} + +\newcommand{\todo}[1]{{\color{red}TODO: #1}} +\newcommand{\effie}[1]{{\color{blue}Effie: #1}} + +% Pseudo Code Commands +\newcommand{\algorithmicbreak}{\textbf{break}} +\newcommand{\Break}{\State \algorithmicbreak} + +\begin{document} + +\maketitle + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Abstract %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{abstract} + We propose a method for sufficient dimension reduction of tensor-valued predictors (multi-dimensional arrays) for regression or classification. We assume a quadratic exponential family for a generalized linear model in an inverse regression setting, where the relation via a link is of a multi-linear nature. + Using a multi-linear relation allows performing per-axis reductions, which drastically reduces the total number of parameters for higher order tensor-valued predictors. Under the exponential family we derive maximum likelihood estimates for the multi-linear sufficient dimension reduction of the tensor-valued predictors. Furthermore, we provide an estimation algorithm which utilizes the tensor structure, allowing efficient implementations. The performance of the method is illustrated via simulations, and real world examples are provided.
+\end{abstract} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{Quadratic Exponential Family GLM} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{description} + \item[Distribution] + \begin{displaymath} + f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = h(\ten{X})\exp(\t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)) + \end{displaymath} + \item[(inverse) link] + \begin{displaymath} + \invlink(\mat{\eta}(\mat{\theta}_y)) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y] + \end{displaymath} + \item[(multi) linear predictor] For + \begin{displaymath} + \mat{\eta}_y = \mat{\eta}(\mat{\theta}_y) = \begin{pmatrix} + \mat{\eta}_1(\mat{\theta}_y) \\ + \mat{\eta}_2(\mat{\theta}_y) + \end{pmatrix},\qquad + \mat{t}(\ten{X}) = \begin{pmatrix} + \mat{t}_1(\ten{X}) \\ + \mat{t}_2(\ten{X}) + \end{pmatrix} = \begin{pmatrix} + \vec{\ten{X}} \\ + \vec{\ten{X}}\otimes\vec{\ten{X}} + \end{pmatrix} + \end{displaymath} + where + \begin{align*} + \mat{\eta}_1(\mat{\theta}_y) &= \mat{\eta}_{y,1} = c_1 \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \\ + \mat{\eta}_2(\mat{\theta}_y) &= \mat{\eta}_{y,2} = c_2 \vec{\bigotimes_{k = r}^1 \mat{\Omega}_k} + \end{align*} + with model parameters $\overline{\ten{\eta}}_1, \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{\Omega}_1, ..., \mat{\Omega}_r$ where $\overline{\ten{\eta}}_1$ is a $p_1\times ... \times p_r$ tensor, $\mat{\alpha}_j$ are $p_j\times q_j$ unconstrained matrices and $\mat{\Omega}_j$ are symmetric $p_j\times p_j$ matrices for each of the $j = 1, ..., r$ modes. Finally, $c_1$ and $c_2$ are known constants simplifying modeling for specific distributions. +\end{description} +% With that approach we get +% \begin{displaymath} +% \t{\mat{\eta}(\mat{\theta}_y)}\mat{t}(\ten{X}) = \t{\mat{\eta}_{y,1}}\mat{t}_1(\ten{X}) + \t{\mat{\eta}_{y,2}}\mat{t}_2(\ten{X}) = \langle\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k, \ten{X} \rangle + \langle\ten{X}\times_{k\in[r]}\mat{\Omega}_k, \ten{X} \rangle. +% \end{displaymath} + +\begin{theorem}[Log-Likelihood and Score] + For $n$ i.i.d. observations $(\ten{X}_i, y_i), i = 1, ..., n$ the log-likelihood has the form + \begin{displaymath} + l(\mat{\eta}_y) = \sum_{i = 1}^n(\log h(\ten{X}_i) + c_1\langle\overline{\ten{\eta}}_1 + \ten{F}_{y_i}\times_{k\in[r]}\mat{\alpha}_k, \ten{X}_i \rangle + c_2\langle\ten{X}_i\times_{k\in[r]}\mat{\Omega}_k, \ten{X}_i \rangle - b(\mat{\eta}_{y_i})). 
+ \end{displaymath} + % The MLE estimate for the intercept term $\overline{\ten{\eta}}_1$ is + % \begin{displaymath} + % \widehat{\ten{\eta}}_1 = \frac{1}{n}\sum_{i = 1}^n \ten{X}_i + % \end{displaymath} + The gradients with respect to the GLM parameters $\overline{\ten{\eta}}_1$, $\mat{\alpha}_j$ and $\mat{\Omega}_j$ for $j = 1, ..., r$ are given by + \begin{align*} + \nabla_{\overline{\ten{\eta}}_1}l &= c_1\sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i})), \\ + \nabla_{\mat{\alpha}_j}l &= c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \invlink_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\ + \vec\nabla_{\mat{\Omega}_j}l &= c_2 \mat{D}_{p_j}\t{\mat{D}_{p_j}} \reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n(\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i}))\Big)_{(j, r + j)}\vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k + \end{align*} + % The Fisher Information for the GLM parameters is given block wise by + % \begin{displaymath} + % % \mathcal{I}_{\ten{X}\mid Y = y}(\vec{\overline{\ten{\eta}}_1}, \vec\mat{\alpha}_1, ..., \vec\mat{\alpha}_r, \vec\mat{\Omega}_1, ..., \vec\mat{\Omega}_r) = \begin{pmatrix} + % \mathcal{I}_{\ten{X}\mid Y = y} = \begin{pmatrix} + % \mathcal{I}(\overline{\ten{\eta}}_1) & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\alpha}_1) & \cdots & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\alpha}_r) & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\Omega}_1) & \cdots & \mathcal{I}(\overline{\ten{\eta}}_1, \mat{\Omega}_r) \\ + % \mathcal{I}(\mat{\alpha}_1, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\alpha}_1, \mat{\alpha}_r) & \mathcal{I}(\mat{\alpha}_1, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\alpha}_1, \mat{\Omega}_r) \\ + % \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\ + % \mathcal{I}(\mat{\alpha}_r, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\alpha}_r, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\alpha}_r) & \mathcal{I}(\mat{\alpha}_r, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\alpha}_r, \mat{\Omega}_r) \\ + % \mathcal{I}(\mat{\Omega}_1, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\Omega}_1, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\Omega}_1, \mat{\alpha}_r) & \mathcal{I}(\mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\Omega}_1, \mat{\Omega}_r) \\ + % \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\ + % \mathcal{I}(\mat{\Omega}_r, \overline{\ten{\eta}}_1) & \mathcal{I}(\mat{\Omega}_r, \mat{\alpha}_1) & \cdots & \mathcal{I}(\mat{\Omega}_r) & \mathcal{I}(\mat{\Omega}_r, \mat{\Omega}_1) & \cdots & \mathcal{I}(\mat{\Omega}_r) + % \end{pmatrix} + % \end{displaymath} + % where + % \begin{align*} + % \mathcal{I}(\overline{\ten{\eta}}_1) &= -\sum_{i = 1}^n \cov_{\mat{\theta}_{y_i}}(\vec\ten{X}\mid Y = y_i), \\ + % \mathcal{I}(\mat{\alpha}_j) &= -\sum_{i = 1}^n ((\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\otimes\mat{I}_{p_j})\mat{K}_{\mat{p},(j)}\cov_{\mat{\theta}_{y_i}}(\vec\ten{X}\mid Y = y_i)\t{\mat{K}_{\mat{p},(j)}}(\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\otimes\mat{I}_{p_j}), \\ + % \mathcal{I}(\mat{\alpha}_j) &= -\sum_{i = 1}^n \todo{continue} + % \end{align*} + + % \todo{Fisher Information} +\end{theorem} + +Illustration of dimensions +\begin{displaymath} + \underbrace{ \mat{D}_{p_j}\t{\mat{D}_{p_j}} }_{\makebox[0pt]{\scriptsize $p_j^2\times p_j^2$}} + % + \underbrace{% + \overbrace{\reshape{(\mat{p}, \mat{p})}\!\!\Big(\sum_{i = 1}^n + 
 \underbrace{ (\mat{t}_2(\ten{X}_i) - \invlink_2(\mat{\eta}_{y_i})) }_{p^2\times 1} + \Big)}^{\substack{\text{(tensor of order $2 r$)}\\p_1\times p_2\times ... \times p_r\times p_1\times p_2\times ... \times p_r}} \!\!\makebox[0pt]{\phantom{\Big)}}_{(j, r + j)} + }_{\substack{p_j^2\times (p / p_j)^2\\\text{(matricized / put $j$ mode axis to the front)}}} + % + \underbrace{% + \vec \overbrace{ \bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_k }^{\makebox[0pt]{\scriptsize $(p/p_j)\times (p/p_j)$}} + }_{\makebox[0pt]{\scriptsize $(p/p_j)^2\times 1$}} +\end{displaymath} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{Sufficient Dimension Reduction} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +\begin{theorem}[SDR]\label{thm:sdr} + A sufficient reduction for the regression $y\mid \ten{X}$ under the quadratic exponential family inverse regression model \todo{reg} is given by + \begin{displaymath} + R(\ten{X}) = \vec(\ten{X}\times_{k\in[r]}\mat{\Omega}_k\mat{\alpha}_k). + \end{displaymath} + \todo{type proof in appendix} +\end{theorem} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{Special Distributions} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +We illustrate the SDR method on two special cases, first on the Tensor Normal distribution and second on the Multi-Variate Bernoulli distribution with vector-, matrix- and tensor-valued predictors. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Tensor Normal} + +Let $\ten{X}, \ten{F}_y$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. We assume the inverse regression model for $\ten{X}\mid Y = y$ to be tensor normal distributed with density +\begin{displaymath} + f_{\mat{\theta}_y}(\ten{X}\mid Y = y) = (2\pi)^{-p/2}\prod_{k = 1}^r |\mat{\Delta}_{k}|^{-p / (2 p_{k})}\exp\Big( + -\frac{1}{2}\langle \ten{X} - \ten{\mu}_y, (\ten{X} - \ten{\mu}_y)\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + \Big) +\end{displaymath} +with location parameter tensor $\ten{\mu}_y$ depending on $y$ and the symmetric covariance matrices $\mat{\Delta}_{k}$ for each of the $k\in[r]$ modes (independent of $y$) collected in the parameter vector $\mat{\theta}_y = (\ten{\mu}_y, \mat{\Delta}_1, ..., \mat{\Delta}_r)$. Rewriting into the form of a quadratic exponential family leads to +\begin{align*} + f_{\mat{\theta}_y}(\ten{X}\mid Y = y) + &= (2\pi)^{-p/2} \exp\Big( + -\frac{1}{2}\langle \ten{X}, \ten{X}\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + +\langle \ten{X}, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_k^{-1} \rangle \\ + &\makebox[10em]{}-\frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + -\sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_k| + \Big) \\ + &= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y)).
+\end{align*} +Identifying the exponential family components gives +\begin{align*} + h(\ten{X}) &= (2\pi)^{-p/2} \\ + b(\mat{\theta}_y) &= \frac{1}{2}\langle \ten{\mu}_y, \ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1} \rangle + \sum_{k = 1}^r \frac{p}{2 p_{k}}\log|\mat{\Delta}_{k}| +\end{align*} +and +\begin{align*} + \mat{\eta}(\mat{\theta}_y) &= (\mat{\eta}_1(\mat{\theta}_y); \mat{\eta}_2(\mat{\theta}_y)) & + \mat{t}(\ten{X}) &= (\mat{t}_1(\ten{X}); \mat{t}_2(\ten{X})) +\end{align*} +where +\begin{align*} + \mat{\eta}_1(\mat{\theta}_y) = \mat{\eta}_{y,1} &= \vec(\ten{\mu}_y\times_{k\in[r]}\mat{\Delta}_{k}^{-1}), & + \mat{t}_1(\ten{X}) &= \vec\ten{X}, \\ + \mat{\eta}_2(\mat{\theta}_y) = \mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Delta}_{k}^{-1}, & + \mat{t}_2(\ten{X}) &= \vec\ten{X}\otimes\vec\ten{X}. +\end{align*} +The natural parameters are modeled as described in the multi-linear GLM as +\begin{align*} + \mat{\eta}_{y,1} &= \vec(\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_{k}) \\ + \mat{\eta}_{y,2} &= -\frac{1}{2}\vec\bigotimes_{k = r}^{1}\mat{\Omega}_{k}. +\end{align*} +The intercept parameter $\overline{\ten{\eta}}_1$ is of the same dimensions as $\ten{X}$ and the reduction matrices $\mat{\alpha}_j$ are of dimensions $p_j\times q_j$ while the symmetric $\mat{\Omega}_j$ are of dimensions $p_j\times p_j$. The inverse relation from the GLM parameters to the tensor normal parameters is +\begin{align*} + \ten{\mu}_y &= (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_{k})\times_{k\in[r]}\mat{\Omega}_{k}^{-1}, \qquad \vec\ten{\mu}_y = (\unvec(-2\mat{\eta}_{y,2}))^{-1}\mat{\eta}_{y,1}, \\ + \mat{\Delta}_{k} &= \mat{\Omega}_{k}^{-1} +\end{align*} +for each $k\in[r]$. The inverse link is given by +\begin{displaymath} + \invlink(\mat{\eta}_y) = \E_{\mat{\theta}_y}[\mat{t}(\ten{X})\mid Y = y] +\end{displaymath} +consisting of the first and second (uncentered) vectorized moments of the tensor normal distribution, +\begin{align*} + \invlink_1(\mat{\eta}_y) &\equiv \E[\ten{X} \mid Y = y] = \ten{\mu}_y \\ + &= (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k) \times_{k\in[r]}\mat{\Omega}_k^{-1}, \\ + \invlink_2(\mat{\eta}_y) &\equiv \E[\vec(\ten{X})\t{\vec(\ten{X})} \mid Y = y] \\ + &= \cov(\vec\ten{X} \mid Y = y) + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)} \\ + &= \bigotimes_{k = r}^{1}\mat{\Omega}_k^{-1} + \vec(\ten{\mu}_y)\t{\vec(\ten{\mu}_y)}. +\end{align*} + +For estimation purposes it is also of interest to express the log-partition function $b$ in terms of the natural parameters or the GLM parameters, which has the form +\begin{displaymath} + b(\mat{\eta}_y) = \frac{1}{2}\t{\mat{\eta}_{y, 1}}(\unvec(-2\mat{\eta}_{y, 2}))^{-1}\mat{\eta}_{y, 1} - \frac{1}{2}\log|\unvec(-2\mat{\eta}_{y, 2})|. +\end{displaymath}
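To make these relations concrete, the following minimal base \texttt{R} sketch maps the GLM parameters $(\overline{\ten{\eta}}_1, \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{\Omega}_1, ..., \mat{\Omega}_r)$ together with a given $\ten{F}_y$ to the tensor normal parameters $(\ten{\mu}_y, \mat{\Delta}_1, ..., \mat{\Delta}_r)$. The helper \texttt{mlm} below is a plain multi-linear multiplication written only for this illustration; the \texttt{tensorPredictors} package ships its own \texttt{mlm}/\texttt{ttm}, whose interface may differ.
\begin{verbatim}
# multi-linear multiplication A x_1 B[[1]] x_2 ... x_r B[[r]]  (illustration only)
mlm <- function(A, B) {
    for (k in seq_along(B)) {
        dims <- dim(A)
        perm <- c(k, seq_along(dims)[-k])
        Ak <- matrix(aperm(A, perm), nrow = dims[k])     # mode-k matricization A_(k)
        A <- aperm(array(B[[k]] %*% Ak, c(nrow(B[[k]]), dims[-k])), order(perm))
    }
    A
}

# GLM parameters -> tensor normal parameters (mu_y, Delta_1, ..., Delta_r)
tnorm.params <- function(eta.bar, alphas, Omegas, F.y) {
    Deltas <- lapply(Omegas, solve)                      # Delta_k = Omega_k^{-1}
    mu.y <- mlm(eta.bar + mlm(F.y, alphas), Deltas)      # location tensor mu_y
    list(mu = mu.y, Deltas = Deltas)
}
\end{verbatim}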
Denote the residuals as +\begin{displaymath} + \ten{R}_i = \ten{X}_i - \ten{\mu}_{y_i} +\end{displaymath} +then with $\overline{\ten{R}} = \frac{1}{n}\sum_{i = 1}^n \ten{R}_i$ we get +\begin{align*} + \nabla_{\overline{\ten{\eta}}_1} l &= \overline{\ten{R}}, \\ + \nabla_{\mat{\alpha}_j} l &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}, \\ + \D l(\mat{\Omega}_j) &= \frac{1}{2}\t{\vec\Big(\frac{p}{p_j}\mat{\Omega}_j^{-1} - (\ten{X} + \ten{\mu}_y)_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}\mat{\Omega}_k)_{(j)}}\Big)}\mat{D}_{p_j}\t{\mat{D}_{p_j}} +\end{align*} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsubsection{Initial Values} +First we set the gradient with respect to $\overline{\ten{\eta}}_1$ to zero +\begin{gather*} + 0 \overset{!}{=} \nabla_{\overline{\ten{\eta}}_1}l = c_1\sum_{i = 1}^n (\ten{X}_i - \ten{\mu}_i) \\ + \overline{\ten{X}} = (\overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k})\times_{l\in[r]}\mat{\Omega}_{l}^{-1} \\ + \overline{\ten{X}}\times_{l\in[r]}\mat{\Omega}_{l} = \overline{\ten{\eta}}_1 + \overline{\ten{F}_y}\times_{k\in[r]}\mat{\alpha}_{k} \approx \overline{\ten{\eta}}_1 \\ + \overline{\ten{\eta}}_1^{(0)} = \overline{\ten{X}}\times_{k\in[r]}\mat{\Omega}_{k}^{(0)} +\end{gather*} +where the approximation is due to the assumption that $\E \ten{F}_y = 0$. For the initial values of the scatter matrices $\mat{\Omega}_{l}$ we ignore the relation to the response and simply estimate them as the marginal scatter matrices. These initial estimates overemphasize the variability in the reduction subspace. Therefore, we first compute the unscaled mode covariance estimates +\begin{displaymath} + \widetilde{\mat{\Delta}}_j^{(0)} = \frac{p_j}{n p} (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{X} - \overline{\ten{X}})_{(j)}}. +\end{displaymath} +The next step is to scale them such that their Kronecker product has an appropriate trace +\begin{displaymath} + \mat{\Delta}_j^{(0)} = \left(\frac{\|\ten{X} - \overline{\ten{X}}\|_F^2}{n \prod_{k = 1}^r \tr(\widetilde{\mat{\Delta}}_k^{(0)})}\right)^{1 / r} \widetilde{\mat{\Delta}}_j^{(0)}. +\end{displaymath} +Finally, the covariances need to be inverted to give initial estimates of the scatter matrices +\begin{displaymath} + \mat{\Omega}_j^{(0)} = (\mat{\Delta}_j^{(0)})^{-1}. +\end{displaymath}
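A minimal numerical sketch (base \texttt{R}) of these initial covariance and scatter estimates is given below. It assumes the data are collected in an array \texttt{X} with the $n$ observations in the last mode and uses a plain mode-$j$ matricization helper written only for this illustration; the \texttt{tensorPredictors} package provides its own helpers (e.g.\ \texttt{mcov}, \texttt{matricize}), whose interfaces may differ.
\begin{verbatim}
# Sketch of the initial estimates Delta_j^(0) and Omega_j^(0) above.
# Assumption: `X` is a (p_1 x ... x p_r x n) array, observations in the last mode.
mat <- function(A, j) {                          # mode-j matricization A_(j)
    dims <- dim(A)
    matrix(aperm(A, c(j, seq_along(dims)[-j])), nrow = dims[j])
}
init.scatter <- function(X) {
    dims <- dim(X)
    r <- length(dims) - 1L
    p <- dims[seq_len(r)]
    n <- dims[r + 1L]
    Xc <- X - c(apply(X, seq_len(r), mean))      # center (recycled over observations)
    Delta.tilde <- lapply(seq_len(r), function(j) {
        (p[j] / (n * prod(p))) * tcrossprod(mat(Xc, j))
    })
    # rescale so the Kronecker product of the Delta_j^(0) has the proper trace
    s <- (sum(Xc^2) / (n * prod(sapply(Delta.tilde, function(D) sum(diag(D))))))^(1 / r)
    Delta0 <- lapply(Delta.tilde, `*`, s)
    list(Delta = Delta0, Omega = lapply(Delta0, solve))
}
\end{verbatim}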
+The really interesting part is to get initial estimates for the $\mat{\alpha}_j$ matrices. Setting the $\mat{\alpha}_j$ gradient to zero and substituting the initial estimates for $\overline{\ten{\eta}}_1$ and the $\mat{\Omega}_k$'s gives
+\begin{gather*}
+    0 \overset{!}{=} \nabla_{\mat{\alpha}_j}l = c_1 \sum_{i = 1}^n \reshape{\mat{p}}(\mat{t}_1(\ten{X}_i) - \mat{g}_1(\mat{\eta}_{y_i}))_{(j)}\t{(\ten{F}_{y_i}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\
+    (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
+    = \mat{\Omega}_j^{(0)}\mat{\alpha}_j(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k)_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
+\end{gather*}
+Now letting $\mat{\Sigma}_k$ be the mode covariances of $\ten{F}_y$ and defining $\ten{W}_y = \ten{F}_y\times_{k\in[r]}\mat{\Sigma}_k^{-1/2}$ we get
+\begin{gather*}
+    (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}
+    = \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y\times_{k\in[r]\backslash j}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_k^{1/2})_{(j)}\t{(\ten{W}_y \times_{k\in[r]\backslash j}\mat{\alpha}_k \mat{\Sigma}_{k}^{1/2})_{(j)}}\mat{\Sigma}_{j}^{1/2} \\
+    = \mat{\Omega}_j^{(0)}\mat{\alpha}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\Big(\mat{I}_n\otimes\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Sigma}_k^{1/2}\t{\mat{\alpha}_k}\mat{\Omega}_k^{(0)}\mat{\alpha}_k\mat{\Sigma}_{k}^{1/2}\Big)\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2}.
+\end{gather*}
+Now we let $\mat{\alpha}_j^{(0)}$ be such that $\mat{\Sigma}_k^{1/2}\t{(\mat{\alpha}^{(0)}_k)}\mat{\Omega}_k^{(0)}\mat{\alpha}^{(0)}_k\mat{\Sigma}_{k}^{1/2} = \mat{I}_{q_k}$, which leads by substitution to
+\begin{displaymath}
+    (\ten{X} - \overline{\ten{X}})_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}^{(0)}_k)_{(j)}}
+    = \mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j^{1/2}(\ten{W}_y)_{(j)}\t{(\ten{W}_y)_{(j)}}\mat{\Sigma}_{j}^{1/2}
+    = \frac{p_j}{n p}\mat{\Omega}_j^{(0)}\mat{\alpha}^{(0)}_j\mat{\Sigma}_j
+\end{displaymath}
+\todo{Does this make sense?!?!?!}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Ising Model}
+For the inverse regression $\ten{X}\mid Y = y$ the Ising model probability mass function with $p (p + 1) / 2$ parameters $\mat{\theta}_y$ is given by
+\begin{align*}
+    P_{\mat{\theta}_y}(\ten{X}\mid Y = y)
+    &= p_0(\mat{\theta}_y)\exp(\t{\vech(\vec(\ten{X})\t{\vec(\ten{X})})}\mat{\theta}_y) \\
+    &= h(\ten{X})\exp(\t{\mat{{\eta}}(\mat{\theta}_y)}\mat{t}(\ten{X}) - b(\mat{\theta}_y))
+\end{align*}
+where $h(\ten{X}) = 1$ and $b(\mat{\theta}_y) = -\log p_0(\mat{\theta}(\mat{\eta}_y))$.
+According to the GLM model we get
+\begin{align*}
+    \mat{\eta}_{y,1} &\equiv c_1 (\overline{\ten{\eta}}_1 + \ten{F}_y\times_{k\in[r]}\mat{\alpha}_k), &
+    \mat{\eta}_{y,2} &\equiv c_2 \bigotimes_{k = r}^{1}\mat{\Omega}_k,
+\end{align*}
+which yields the following relation to the conditional Ising model parameters
+\begin{displaymath}
+    \mat{\theta}_y = \mat{\theta}(\mat{\eta}_y) = \vech(\diag(\mat{\eta}_{y,1}) + (2_{p\times p} - \mat{I}_p) \odot \reshape{(p, p)}(\mat{\eta}_{y,2}))
+\end{displaymath}
+where the constants $c_1, c_2$ can be chosen arbitrarily, as long as they are non-zero.
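+A small R sketch of this $\mat{\eta}$-to-$\mat{\theta}_y$ conversion (taking $c_1 = c_2 = 1$; the function name is illustrative and not part of the \texttt{mvbernoulli} package):
+\begin{verbatim}
+# theta_y = vech(diag(eta1) + (2 * 1_{pxp} - I_p) o reshape(eta2))
+# eta1: length-p vector, Eta2: the p x p matrix reshape_(p,p)(eta_{y,2})
+theta.from.eta <- function(eta1, Eta2) {
+    p <- length(eta1)
+    Theta <- diag(eta1) + (matrix(2, p, p) - diag(p)) * Eta2
+    Theta[lower.tri(Theta, diag = TRUE)]    # vech(.), column-major
+}
+\end{verbatim}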
+The ``inverse'' link is then computed via the Ising model as the conditional expectation of all interactions
+\begin{align*}
+    \invlink_2(\mat{\eta}_y) \equiv \E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y]
+\end{align*}
+which incorporates the first moment. In other words $\invlink_1(\mat{\eta}_y) = \diag(\E_{\mat{\theta}_y}[\vec(\ten{X})\t{\vec(\ten{X})}\mid Y = y])$.
+
+
+% The ``inverse'' link is given by
+% \begin{align*}
+%     \invlink_1(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y] \\
+%     \invlink_2(\mat{\eta}_y) &\equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y]
+% \end{align*}
+% and note that $\diag(\E_{\mat{\theta}(\mat{\eta}_y)}[\vec(\ten{X})\t{\vec(\ten{X})} | Y = y]) \equiv \E_{\mat{\theta}(\mat{\eta}_y)}[\ten{X} | Y = y]$.
+%
+% The gradients of the log-likelihood are now given by
+% \begin{align*}
+%     \nabla_{\overline{\ten{\eta}}_1} l
+%     &= \frac{1}{n}\sum_{i = 1}^n \ten{R}_i \\
+%     \nabla_{\mat{\alpha}_j} l
+%     &= \frac{1}{n}\ten{R}_{(j)}\t{(\ten{F}_y\times_{k\in[r]\backslash j}\mat{\alpha}_j)_{(j)}} \\
+%     \vec(\nabla_{\mat{\Omega}_j} l)
+%     &= \t{\vec( (\reshape{(\mat{p}, \mat{p})}(\overline{\mat{t}_2(\ten{X}_i)} - \E[\mat{t}_2(\ten{X})\mid Y = y_i]))_{(j, r + j)} \vec\bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{\Omega}_j)}\mat{D}_{p_j}\t{\mat{D}_{p_j}}
+% \end{align*}
+% using the notation $\overline{\mat{t}_2(\ten{X})} = \frac{1}{n}\sum_{i = 1}^n \mat{t}_2(\ten{X}_i) = \frac{1}{n}\sum_{i = 1}^n \vec(\ten{X}_i)\otimes \vec(\ten{X}_i)$.
+
+\printbibliography[heading=bibintoc,title={References}]
+\appendix
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    \section{Vectorization and Matricization}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{displaymath}
+    \vec(\ten{A}\times_{k\in[r]}\mat{B}_k) = \Big(\bigotimes_{k = r}^1 \mat{B}_k\Big)\vec\ten{A}
+\end{displaymath}
+\begin{displaymath}
+    (\ten{A}\times_{k\in[r]}\mat{B}_k)_{(j)} = \mat{B}_j\ten{A}_{(j)}\bigotimes_{\substack{k = r\\k\neq j}}^1\t{\mat{B}_k}
+\end{displaymath}
+of which a special case is $(\ten{A}\times_{j}\mat{B}_j)_{(j)} = \mat{B}_j\ten{A}_{(j)}$.
+
+
+Let $\ten{A}$ be an $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ tensor and $\mat{B}_k$ be $p_k\times q_k$ matrices, then
+\begin{displaymath}
+    \ten{A}_{(1)} \vec{\bigotimes_{k = r}^{1}\mat{B}_k}
+    =
+    \Big(\ten{R}(\ten{A})\times_{\substack{k + 1\\k\in[r]}}\t{\vec(\mat{B}_k)}\Big)_{(1)}
+\end{displaymath}
+where $\ten{R}$ is a permutation of the axes and a reshaping of the tensor $\ten{A}$. This axis permutation converts $n\times p_1\times ... \times p_r\times q_1\times ... \times q_r$ to $n\times p_1\times q_1 \times ... \times p_r\times q_r$ and the reshaping vectorizes the axis pairs $p_k\times q_k$ leading to a tensor $\ten{R}(\ten{A})$ of dimensions $n\times p_1 q_1\times ...\times p_r q_r$.
+
+An alternative way to write this, for each of the $i\in[n]$ vector components, is
+\begin{displaymath}
+    \Big(\ten{A}_{(1)}\vec{\bigotimes_{k = r}^{1}\mat{B}_k}\Big)_{i}
+    = \sum_{J\in[(\mat{p}, \mat{q})]}
+    \ten{A}_{i, J}\prod_{k = 1}^r (B_k)_{J_k, J_{k + r}}
+\end{displaymath}
+using the notation $J\in[(\mat{p}, \mat{q})] = [p_1]\times ... \times [p_r]\times [q_1]\times ... \times [q_r]$.
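+The first identity of this section is easy to verify numerically. A self-contained R sketch is given below; the naive \texttt{ttm.naive} helper is for illustration only and is not the \texttt{ttm}/\texttt{mlm} implementation of the \texttt{tensorPredictors} package.
+\begin{verbatim}
+# naive mode-k product A x_k B via permuting mode k to the front
+ttm.naive <- function(A, B, k) {
+    dims <- dim(A)
+    perm <- c(k, seq_along(dims)[-k])
+    Ck <- B %*% matrix(aperm(A, perm), nrow = dims[k])  # multiply mode-k unfolding
+    aperm(array(Ck, dim = c(nrow(B), dims[-k])), order(perm))
+}
+# check vec(A x_{k in [r]} B_k) == (B_r kron ... kron B_1) vec(A)
+q <- c(2, 3, 4); p <- c(3, 2, 2)
+A <- array(rnorm(prod(q)), dim = q)
+B <- Map(function(pk, qk) matrix(rnorm(pk * qk), pk, qk), p, q)
+C <- A
+for (k in seq_along(B)) C <- ttm.naive(C, B[[k]], k)
+all.equal(c(C), c(Reduce(`%x%`, rev(B)) %*% c(A)))      # TRUE
+\end{verbatim}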
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    \section{Pattern Matrices}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+The \emph{duplication matrix} $\mat{D}_p$ of dimensions $p^2\times p(p + 1) / 2$ is defined implicitly such that for any symmetric $p\times p$ matrix $\mat{A}$ holds
+\begin{displaymath}
+    \mat{D}_p\vech\mat{A} = \vec{\mat{A}}.
+\end{displaymath}
+Let $\mat{A}$ be a $p\times q$ matrix, then we denote the \emph{commutation matrix} $\mat{K}_{p,q}$ as the $p q\times p q$ matrix satisfying
+\begin{displaymath}
+    \mat{K}_{p,q}\vec\mat{A} = \vec{\t{\mat{A}}}.
+\end{displaymath}
+The identity giving the commutation matrix its name is, for $\mat{A}$ of dimensions $a_1\times a_2$ and $\mat{B}$ of dimensions $b_1\times b_2$,
+\begin{displaymath}
+    \mat{A}\otimes\mat{B} = \mat{K}_{a_1,b_1}(\mat{B}\otimes\mat{A})\t{\mat{K}_{a_2,b_2}}.
+\end{displaymath}
+For a generalization of the commutation matrix let $\ten{A}$ be a $p_1\times ...\times p_r$ tensor of order $r$. Then the \emph{generalized commutation matrix} $\mat{K}_{(p_1, ..., p_r),(j)}$ is implicitly defined such that
+\begin{displaymath}
+    \mat{K}_{(p_1, ..., p_r),(j)}\vec{\ten{A}} = \vec{\ten{A}_{(j)}}
+\end{displaymath}
+for every mode $j \in[r]$. This is a direct generalization of the commutation matrix with the special case $\mat{K}_{(p,q),(2)} = \mat{K}_{p,q}$ and the trivial case $\mat{K}_{(p_1, ..., p_r),(1)} = \mat{I}_{p}$ for $p = \prod_{j=1}^r p_j$. Furthermore, with a dimension vector $\mat{p} = (p_1, ..., p_r)$ it is convenient to write $\mat{K}_{(p_1, ..., p_r),(j)}$ as $\mat{K}_{\mat{p},(j)}$. Its relation to the classical commutation matrix is given by
+\begin{displaymath}
+    \mat{K}_{\mat{p}, (j)} = \mat{I}_{\overline{p}_j} \otimes \mat{K}_{\underline{p}_j, p_j}
+\end{displaymath}
+where $\overline{p}_j = \prod_{k = j + 1}^r p_k$ and $\underline{p}_j = \prod_{k = 1}^{j - 1}p_k$ with an empty product set to $1$.
+The generalized commutation matrix leads to a generalization of the Kronecker product commutation identity
+\begin{displaymath}
+    \bigotimes_{\substack{k = r\\k\neq j}}^{1}\mat{A}_k\otimes \mat{A}_j = \mat{K}_{\mat{p}, (j)}\Big(\bigotimes_{k = r}^1 \mat{A}_k\Big)\t{\mat{K}_{\mat{q}, (j)}}
+\end{displaymath}
+for arbitrary matrices $\mat{A}_k$ of dimensions $p_k\times q_k$, $k \in[r]$ which are collected in the dimension vectors $\mat{p} = (p_1, ..., p_r)$ and $\mat{q} = (q_1, ..., q_r)$. Next, the \emph{symmetrizer} $\mat{N}_p$ is a $p^2\times p^2$ matrix such that for any $p\times p$ matrix $\mat{A}$
+\begin{displaymath}
+    \mat{N}_p \vec{\mat{A}} = \frac{1}{2}(\vec{\mat{A}} + \vec{\t{\mat{A}}}).
+\end{displaymath}
+Another matrix which might come in handy is the \emph{selection matrix} $\mat{S}_p$ of dimensions $p\times p^2$ which selects the diagonal elements of a $p\times p$ matrix $\mat{A}$ from its vectorization
+\begin{displaymath}
+    \mat{S}_p\vec{\mat{A}} = \diag{\mat{A}}
+\end{displaymath}
+where $\diag{\mat{A}}$ denotes the vector of diagonal elements of $\mat{A}$.
+
+For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensions $b_1\times b_2$ holds
+\begin{equation}\label{eq:vecKron}
+    \vec(\mat A\otimes\mat B) = (\mat{I}_{a_2}\otimes\mat{K}_{b_2,a_1}\otimes\mat{I}_{b_1})(\vec\mat A\otimes\vec\mat B).
+\end{equation} + +\begin{align*} + \pinv{\mat{D}_p} &= (\t{\mat{D}_p}\mat{D}_p)^{-1}\t{\mat{D}_p} \\ + \pinv{\mat{D}_p}\mat{D}_p &= \mat{I}_{p(p+1)/2} \\ + \mat{D}_p\pinv{\mat{D}_p} &= \mat{N}_{p} \\ + \t{\mat{K}_{p,q}} &= \mat{K}_{p,q}^{-1} = \mat{K}_{q,p} \\ + \t{\mat{K}_{\mat{p},(j)}} &= \mat{K}_{\mat{p},(j)}^{-1} +\end{align*} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{Matrix Calculus} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{example} + We want to find the derivative with respect to any of the $r$ symmetric $p_j\times p_j$ matrices $\mat{\Omega}_j$ where $j = 1, ..., r$ of the Kronecker product + \begin{displaymath} + \mat{F} = \bigotimes_{k = r}^1 \mat{\Omega}_k. + \end{displaymath} + Therefore, denote + \begin{align*} + p &= \prod_{k = 1}^r p_k, & \overline{p}_j &= \prod_{k = j + 1}^r p_k, & \underline{p}_j &= \prod_{k = 1}^{j - 1} p_k, \\ + & & \overline{\mat{\Omega}}_j &= \bigotimes_{k = r}^{j+1}\mat{\Omega}_k, & \underline{\mat{\Omega}}_j &= \bigotimes_{k = j - 1}^{1}\mat{\Omega}_k + \end{align*} + which slightly simplifies the following. With this notation we have $p = \overline{p}_jp_j\underline{p}_j$ for any of the $j = 1, ..., r$. Furthermore, the matrices $\overline{\mat{\Omega}}_j$ and $\underline{\mat{\Omega}}_j$ are of dimensions $\overline{p}_j\times \overline{p}_j$ and $\underline{p}_j\times \underline{p}_j$, respectively. We start with the differential + \begin{align*} + \d\mat{F} &= \d\bigotimes_{k = r}^1 \mat{\Omega}_k + = \sum_{j = 1}^r \bigotimes_{k = r}^{j+1}\mat{\Omega}_k\otimes\d\mat{\Omega}_j\otimes\bigotimes_{k = j - 1}^{1}\mat{\Omega}_k + = \sum_{j = 1}^r \overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j\otimes\underline{\mat{\Omega}}_j \\ + &= \sum_{j = 1}^r \mat{K}_{\overline{p}_jp_j,\underline{p}_j}(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j)\mat{K}_{\underline{p}_j,\overline{p}_jp_j} + \end{align*} + By vectorizing this transforms to + \begin{align*} + \d\vec\mat{F} &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j\otimes\d\mat{\Omega}_j) \\ + &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\d\vec\mat{\Omega}_j) \\ + &= \sum_{j = 1}^r (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2})\,\d\vec\mat{\Omega}_j \\ + \end{align*} + leading to + \begin{displaymath} + \D\mat{F}(\mat{\Omega}_j) = (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})(\vec(\underline{\mat{\Omega}}_j\otimes\overline{\mat{\Omega}}_j)\otimes\mat{I}_{p_j^2}) + \end{displaymath} + for each $j = 1, ..., r$. 
Note that the $p^2\times p^2$ matrices
+    \begin{displaymath}
+        \mat{P}_j = (\mat{K}_{\overline{p}_jp_j,\underline{p}_j}\otimes\mat{K}_{\overline{p}_jp_j,\underline{p}_j})(\mat{I}_{\overline{p}_j\underline{p}_j}\otimes\mat{K}_{p_j,\overline{p}_j\underline{p}_j}\otimes\mat{I}_{p_j})
+    \end{displaymath}
+    are permutations.
+\end{example}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    \section{Stuff}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+Let $X, Y$ be $p, q$ dimensional random variables, respectively. Furthermore, let $\E X = \mu_X$, $\E Y = \mu_Y$ as well as $\cov(X) = \mat{\Sigma}_X$ and $\cov(Y) = \mat{\Sigma}_Y$. Then define the standardized random variables $Z_X = \mat{\Sigma}_X^{-1/2}(X - \mu_X)$ and $Z_Y = \mat{\Sigma}_Y^{-1/2}(Y - \mu_Y)$. For the standardized variables we have $\E Z_X = 0_p$, $\E Z_Y = 0_q$ and for the covariances we get $\cov(Z_X) = \mat{I}_p$ as well as $\cov(Z_Y) = \mat{I}_q$. Now we take a look at the cross-covariance between $X$ and $Y$
+\begin{displaymath}
+    \cov(X, Y)
+    = \cov(X - \mu_X, Y - \mu_Y)
+    = \cov(\mat{\Sigma}_X^{1/2} Z_X, \mat{\Sigma}_Y^{1/2} Z_Y)
+    = \mat{\Sigma}_X^{1/2}\cov(Z_X, Z_Y)\mat{\Sigma}_Y^{1/2}.
+\end{displaymath}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+    \section{Proofs}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{proof}[Proof of Theorem~\ref{thm:sdr}]
+    abc
+\end{proof}
+
+\end{document}
diff --git a/LaTeX/bernoulli.tex b/LaTeX/bernoulli.tex
new file mode 100644
index 0000000..6be5e56
--- /dev/null
+++ b/LaTeX/bernoulli.tex
@@ -0,0 +1,656 @@
+\documentclass[a4paper, 10pt]{article}
+
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{fullpage}
+\usepackage{amsmath, amssymb, amstext, amsthm}
+\usepackage{bm} % \boldsymbol and italic corrections, ...
+\usepackage[pdftex]{hyperref}
+\usepackage{makeidx} % Index (Symbols, Names, ...)
+\usepackage{xcolor, graphicx} % colors and including images
+\usepackage{tikz}
+\usetikzlibrary{calc}
+\usepackage[
+    % backend=bibtex,
+    style=authoryear-comp
+]{biblatex}
+\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms
+
+% Document meta info
+\title{Bernoulli}
+\author{Daniel Kapla}
+\date{\today}
+% Set PDF title, author and creator.
+\AtBeginDocument{ + \hypersetup{ + pdftitle = {Bernoulli}, + pdfauthor = {Daniel Kapla}, + pdfcreator = {\pdftexbanner} + } +} + +\makeindex + +% Bibliography resource(s) +\addbibresource{main.bib} + +% Setup environments +% Theorem, Lemma +\theoremstyle{plain} +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}{Lemma} +\newtheorem{example}{Example} +% Definition +\theoremstyle{definition} +\newtheorem{defn}{Definition} +% Remark +\theoremstyle{remark} +\newtheorem{remark}{Remark} + +% Define math macros +\newcommand{\mat}[1]{\boldsymbol{#1}} +\newcommand{\ten}[1]{\mathcal{#1}} +\renewcommand{\vec}{\operatorname{vec}} +\newcommand{\dist}{\operatorname{dist}} +\newcommand{\rank}{\operatorname{rank}} +\DeclareMathOperator{\kron}{\otimes} % Kronecker Product +\DeclareMathOperator{\hada}{\odot} % Hadamard Product +\newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) +\DeclareMathOperator{\df}{df} +\DeclareMathOperator{\tr}{tr} +\DeclareMathOperator{\var}{Var} +\DeclareMathOperator{\cov}{Cov} +\DeclareMathOperator{\Span}{Span} +\DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} +% \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} +\DeclareMathOperator*{\argmin}{{arg\,min}} +\DeclareMathOperator*{\argmax}{{arg\,max}} +\newcommand{\D}{\textnormal{D}} % derivative +\renewcommand{\d}{\textnormal{d}} % differential +\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose +\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` +\renewcommand{\vec}{\operatorname{vec}} +\newcommand{\vech}{\operatorname{vech}} +\newcommand{\logical}[1]{{[\![#1]\!]}} + +\begin{document} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Bivariate Bernoulli Distribution} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +A random 2-vector $X\in\{0, 1\}^2$ follows a \emph{Bivariate Bernoulli} distribution if its pmf is +\begin{displaymath} + P(X = (x_1, x_2)) = p_{00}^{(1-x_1)(1-x_2)}p_{01}^{(1-x_1)x_2}p_{10}^{x_1(1-x_2)}p_{11}^{x_1x_2} +\end{displaymath} +where $p_{ab} = P(X = (a, b))$ for $a, b\in\{0, 1\}$. An alternative formulation, in terms of log-odds, follows immediately as +\begin{displaymath} + P(X = (x_1, x_2)) = p_{00}\exp\Big(x_1\log\frac{p_{10}}{p_{00}} + x_2\log\frac{p_{01}}{p_{00}} + x_1x_2\log\frac{p_{00}p_{11}}{p_{01}p_{10}}\Big). +\end{displaymath} +Collecting the log-odds in a parameter vector $\mat{\theta} = \t{(\theta_{01}, \theta_{10}, \theta_{11})}$ where +\begin{align*} + \theta_{01} &= \log\frac{p_{01}}{p_{00}}, \\ + \theta_{10} &= \log\frac{p_{10}}{p_{00}}, \\ + \theta_{11} &= \log\frac{p_{00}p_{11}}{p_{01}p_{10}} +\end{align*} +the pmf can be written more compact as +\begin{displaymath} + P(X = (x_1, x_2)) = P(X = \mat{x}) = p_{00}\exp(\t{\mat{\theta}}\vech(\mat{x}\t{\mat{x}})) + = p_{00}\exp(\t{\mat{x}}\mat{\Theta}\mat{x}) +\end{displaymath} +with the parameter matrix $\mat{\Theta}$ defined as +\begin{displaymath} + \mat{\Theta} = \begin{pmatrix} + \theta_{01} & \tfrac{1}{2}\theta_{11} \\ + \tfrac{1}{2}\theta_{11} & \theta_{10} + \end{pmatrix} = \begin{pmatrix} + \log\frac{p_{01}}{p_{00}} & \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} \\ + \tfrac{1}{2}\log\frac{p_{00}p_{11}}{p_{01}p_{10}} & \log\frac{p_{10}}{p_{00}} + \end{pmatrix}. 
+\end{displaymath}
+The marginal distributions of $X_1$ and $X_2$ are given by
+\begin{align*}
+    P(X_1 = x_1) &= P(X = (x_1, 0)) + P(X = (x_1, 1)) \\
+    &= p_{00}^{1-x_1}p_{10}^{x_1} + p_{01}^{1-x_1}p_{11}^{x_1} \\
+    &= \begin{cases}
+        p_{00} + p_{01}, & x_1 = 0 \\
+        p_{10} + p_{11}, & x_1 = 1
+    \end{cases} \\
+    &= (p_{00} + p_{01})^{1-x_1}(p_{10} + p_{11})^{x_1}. \\
+    P(X_2 = x_2) &= (p_{00} + p_{10})^{1-x_2}(p_{01} + p_{11})^{x_2}.
+\end{align*}
+Furthermore, the conditional distributions are
+\begin{align*}
+    P(X_1 = x_1|X_2 = x_2) = \frac{P(X = (x_1, x_2))}{P(X_2 = x_2)}
+    \propto \big(p_{00}^{1-x_2}p_{01}^{x_2}\big)^{1-x_1}\big(p_{10}^{1-x_2}p_{11}^{x_2}\big)^{x_1}, \\
+    P(X_2 = x_2|X_1 = x_1) = \frac{P(X = (x_1, x_2))}{P(X_1 = x_1)}
+    \propto \big(p_{00}^{1-x_1}p_{10}^{x_1}\big)^{1-x_2}\big(p_{01}^{1-x_1}p_{11}^{x_1}\big)^{x_2}.
+\end{align*}
+Note that both the marginal and the conditional distributions are again Bernoulli distributed. It is also of interest to look at the covariance between the components of $X$, which is given by
+\begin{displaymath}
+    \cov(X_1, X_2) = \E[(X_1 - \E X_1)(X_2 - \E X_2)] = p_{00}p_{11} - p_{01}p_{10}
+\end{displaymath}
+which follows by direct computation.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Multivariate Bernoulli Distribution}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+This is a direct generalization of the Bivariate Bernoulli Distribution. Before we start, a few notes on notation. Let $a, b$ be binary vectors, then $\logical{a = b} = 1$ if and only if $\forall i : a_i = b_i$ and zero otherwise. With that, let $Y\in\{0, 1\}^q$ be a $q$-dimensional \emph{Multivariate Bernoulli} random variable with pmf
+\begin{equation}\label{eq:mvb_pmf}
+    P(Y = y) = \prod_{a\in\{0, 1\}^q} p_a^{\logical{y = a}} = p_y.
+\end{equation}
+The $2^q$ parameters $p_a$ are indexed by the event $a\in\{0, 1\}^q$. The ``indexing'' is done by identifying an event $a\in\{0, 1\}^q$ with the corresponding binary number $m$ the event represents. In more detail, we equate an event $a\in\{0, 1\}^q$ with a number $m\in[0; 2^q - 1]$ as
+\begin{equation}\label{eq:mvb_numbering}
+    m = m(a) = \sum_{i = 1}^q 2^{q - i}a_i
+\end{equation}
+which is a one-to-one relation. For example, for $q = 3$ all events are numbered as in Table~\ref{tab:event-to-number}.
+\begin{table}[!ht]
+    \centering
+    \begin{minipage}{0.8\textwidth}
+        \centering
+        \begin{tabular}{c|c}
+            Event $a$ & Number $m(a)$ \\ \hline
+            (0, 0, 0) & 0 \\
+            (0, 0, 1) & 1 \\
+            (0, 1, 0) & 2 \\
+            (0, 1, 1) & 3 \\
+            (1, 0, 0) & 4 \\
+            (1, 0, 1) & 5 \\
+            (1, 1, 0) & 6 \\
+            (1, 1, 1) & 7
+        \end{tabular}
+        \caption{\label{tab:event-to-number}\small Event numbering relation for $q = 3$. The events $a$ are all the possible elements of $\{0, 1\}^3$ and the numbers $m$ range from $0$ to $2^3 - 1 = 7$.}
+    \end{minipage}
+\end{table}
+
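+The numbering \eqref{eq:mvb_numbering} is straightforward to compute. A minimal R sketch follows; the helper names are illustrative only and this is not the C++ implementation of the \texttt{mvbernoulli} package.
+\begin{verbatim}
+# event a in {0,1}^q  <->  number m(a) = sum_i 2^(q - i) a_i
+event.to.number <- function(a) sum(2L^(rev(seq_along(a)) - 1L) * a)
+number.to.event <- function(m, q) as.integer(intToBits(m))[q:1]
+event.to.number(c(1, 0, 1))    # 5, matching the table above
+number.to.event(5, q = 3)      # 1 0 1
+# the pmf is then a simple lookup into the 2^q probabilities p_a
+mvb.pmf <- function(y, prob) prob[event.to.number(y) + 1L]
+\end{verbatim}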
+\subsection{Exponential Family and Natural Parameters}
+The Multivariate Bernoulli is a member of the exponential family. This can be seen by rewriting the pmf \eqref{eq:mvb_pmf} in terms of an exponential family. First, we take a look at $\logical{y = a}$ for two binary vectors $y, a$ which can be written as
+\begin{align*}
+    \logical{y = a}
+    &= \prod_{i = 1}^q (y_i a_i + (1 - y_i)(1 - a_i))
+    = \prod_{i = 1}^q (y_i (2 a_i - 1) + (1 - a_i)) \\
+    &= \sum_{b\in\{0, 1\}^q}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}(1 - a_i)^{1-b_i} \\
+\intertext{where the last equality follows by multiplying out, similar to the binomial theorem. Note that the inner product is zero if there is at least one $i$ such that $a_i = 1$ but $b_i = 0$, because then $(1 - a_i)^{1-b_i} = 0$, and it is $1$ in all other cases. Therefore, using $\logical{a \leq b}$ gives}
+    ...
+    &= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}\prod_{i = 1}^q [y_i (2 a_i - 1)]^{b_i}
+\intertext{Next, given $\logical{a \leq b}$ we get that $\prod_{i = 1}^q (2 a_i - 1)^{b_i} = (-1)^\logical{|b| \not\equiv_2 |a|}$ by counting the number of zeros in $a$ where at the same index $b$ is one. Because $(2 a_i - 1)^{b_i} = -1$ for $a_i = 0$ and $b_i = 1$ and $1$ in every other case, the product is $-1$ exactly if the number of such indices is odd, which given $a\leq b$ is encoded by $|b| \not\equiv_2 |a|$. This leads to the final version of the rewriting of $\logical{y = a}$ as
+}
+    ...
+    &= \sum_{b\in\{0, 1\}^q}\logical{a\leq b}(-1)^\logical{|b|\not\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i}.
+\end{align*}
+Now, taking the log of \eqref{eq:mvb_pmf} and substituting $\logical{y = a}$ gives
+\begin{align*}
+    \log P(Y = y)
+    &= \sum_{a\in\{0, 1\}^q}\logical{y = a}\log p_a \\
+    &= \sum_{b\in\{0, 1\}^q}\sum_{a\in\{0, 1\}^q}\log(p_a)\logical{a\leq b}(-1)^\logical{|b|\not\equiv_2|a|}\prod_{i = 1}^q y_i^{b_i} \\
+    &= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\sum_{a\leq b}\log(p_a)(-1)^\logical{|b|\not\equiv_2|a|} \\
+    &= \sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}
+\end{align*}
+For each $b\in\{0, 1\}^q$ except for $b = (0, ..., 0)$ define
+\begin{displaymath}
+    \theta_{m(b)} = \theta_b = \log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}, \qquad b\in\{0, 1\}^q\backslash\{0\}^q
+\end{displaymath}
+and collect the thetas in a combined vector $\mat{\theta} = (\theta_1, \theta_2, ..., \theta_{2^q - 1})$ where we used the Bernoulli event to number identification of \eqref{eq:mvb_numbering}. The zero event is excluded here because it is not needed: it is already determined by all the other parameters and will be incorporated as the pmf scaling factor in the exponential family representation. Using the newly defined $\mat{\theta}$ we get
+\begin{align*}
+    P(Y = y) &= \exp\log P(Y = y)
+    = \exp\left(\sum_{b\in\{0, 1\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\log\frac{\prod_{a\leq b, |a|\equiv_2|b|}p_a}{\prod_{a\leq b, |a|\not\equiv_2|b|}p_a}\right) \\
+    &= p_0\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
+\end{align*}
+The final step is to determine a representation of $p_0$ in terms of $\mat{\theta}$. This follows simply from the fact that probabilities sum to $1$.
+\begin{align*}
+    1 &= \sum_{y\in\{0, 1\}^q}P(Y = y) = p_0\left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right), \\
+    p_0(\mat{\theta}) &= \left(1 + \sum_{y\in\{0, 1\}^q\backslash\{0\}^q}\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)\right)^{-1}.
+\end{align*}
+This gives the pmf representation as
+\begin{equation}\label{eq:mvb_exp_fam}
+    P(Y = y) = p_0(\mat{\theta})\exp\left(\sum_{b\in\{0, 1\}^q\backslash\{0\}^q}\left(\prod_{i = 1}^q y_i^{b_i}\right)\theta_b\right)
+    = p_0(\mat{\theta})\exp(\t{T(y)}\mat{\theta})
+\end{equation}
+which proves the fact that the Multivariate Bernoulli is a member of the exponential family. Furthermore, the statistic $T(y)$ in \eqref{eq:mvb_exp_fam} is
+\begin{displaymath}
+    T(y) = \left(\prod_{i = 1}^q y_i^{b_i}\right)_{b\in\{0, 1\}^q\backslash\{0\}^q}
+\end{displaymath}
+which is a $2^q - 1$ dimensional binary vector.
+
+\subsection{Expectation and Covariance}
+First, the expectation of a Multivariate Bernoulli $Y\sim\mathcal{B}_q$ is given by
+\begin{displaymath}
+    (\E Y)_j = (\E Y_j) = \sum_{\substack{y\in\{0, 1\}^q}} P(Y = y)y_j = \sum_{\substack{y\in\{0, 1\}^q\\y_j = 1}}P(Y = y)
+\end{displaymath}
+for each of the $j = 1, ..., q$ components of the $q$-dimensional random vector $Y$. Its covariance is similarly given by
+\begin{displaymath}
+    \cov(Y_i, Y_j)
+    = \E(Y_i - \E Y_i)(Y_j - \E Y_j)
+    = \E Y_i Y_j - (\E Y_i)(\E Y_j)
+    = \sum_{\substack{y\in\{0, 1\}^q\\y_i = y_j = 1}}P(Y = y) - (\E Y_i)(\E Y_j).
+\end{displaymath}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{The Ising Model}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+The Ising model is a special case of the Multivariate Bernoulli with pmf defined directly in its exponential family form by
+\begin{displaymath}
+    P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp\left(\sum_{i = 1}^q \theta_{\iota(i, i)}y_i + \sum_{i = 1}^{q - 1} \sum_{j = i + 1}^q \theta_{\iota(i, j)}y_i y_j\right).
+\end{displaymath}
+This ``constrained'' model only considers two-way interactions, with the natural parameter vector $\mat{\theta}$ of size $q(q + 1)/2$. The indexing function $\iota$ maps the vector indices to the corresponding parameter indices
+\begin{align*}
+    \iota(i, j) &= \iota_0(\min(i, j) - 1, \max(i, j) - 1) + 1, & i, j &= 1, ..., q \\
+\intertext{with the $0$-indexed mapping}
+    \iota_0(i, j) &= \frac{i (2 q + 1 - i)}{2} + (j - i) & i, j &= 0, ..., q - 1\text{ and }i\leq j.
+\end{align*}
+This index mapping is constructed such that the half vectorization of the outer product $\mat{y}\t{\mat{y}}$ corresponds, in its single and two-way interactions between components, to the appropriate parameter indices in $\mat{\theta}$. In other words, the above pmf can be written as
+\begin{displaymath}
+    P_{\mat{\theta}}(Y = \mat{y}) = p_0(\mat{\theta})\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ).
+\end{displaymath}
+The scaling factor $p_0(\mat{\theta})$ (which is also the probability of the zero event, hence the name) is
+\begin{displaymath}
+    p_0(\mat{\theta}) = \Big( \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \Big)^{-1}.
+\end{displaymath}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Conditional Distribution}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+{\color{red} TODO: Fix this, its wrong!!!
+
+For the conditional distribution under the Ising model let $I\subsetneq[q]$ be a non-empty index set (non-empty to avoid the degenerate $P_{\mat{\theta}}(\emptyset|Y = \mat{y})$, and a proper subset since $I = [q]$ would correspond to no conditioning).
Then denote with $\mat{y}_I$ the $|I|$ sub-vector of $\mat{y}$ consisting only of the indices in $I$ while $\mat{y}_{-I}$ is the $q - |I|$ vector with all indices \emph{not} in $I$. +\begin{displaymath} + P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) + = \frac{P_{\mat{\theta}}(Y = \mat{y})}{P_{\mat{\theta}}(Y_{-I} = \mat{y}_{-I})} + \propto (\mat{a}\mapsto P_{\mat{\theta}}(Y_{I} = \mat{a}, Y_{-I} = \mat{y}_{-I}))|_{\mat{a} = \mat{y}_I} +\end{displaymath} +now noting that +\begin{displaymath} + \vech(\mat{y}\t{\mat{y}}) = \vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big) + + \vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big) +\end{displaymath} +with the left summand depending only on $\mat{y}_{-I}$ due to the half vectorization only considureing the lower triangular part and the main diagonal. By substitution into the conditional probability we get +\begin{align*} + P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) + &\propto \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} ) \\ + &= \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\mat{y} - \sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) + \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\ + &\propto \exp\Big( \t{\vech\Big(\mat{y}\t{\big(\sum_{i\in I}y_i\mat{e}_i\big)}\Big)}\mat{\theta} \Big) \\ + &= \prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i} +\end{align*} +leading to the scaled form +\begin{displaymath} + P_{\mat{\theta}}(Y_{I} = \mat{y}_{I} | Y_{-I} = \mat{y}_{-I}) + = p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I})\prod_{i\in I}\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i} +\end{displaymath} +with +\begin{displaymath} + p_0(\mat{\theta} | Y_{-I} = \mat{y}_{-I}) + = P_{\mat{\theta}}(Y_{I} = \mat{0} | Y_{-I} = \mat{y}_{-I}) + = \Big(\sum_{\substack{\mat{a}\in\{0, 1\}^q\\\mat{a}_{-I} = \mat{y}_{-I}}} + \prod_{i\in I}\exp( \t{\vech(\mat{a}\t{\mat{e}_i})}\mat{\theta} )^{a_i}\Big)^{-1}. +\end{displaymath} + +} % end of TODO: Fix this, its wrong!!! 
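+
+Independently of the algebra above, the conditional probabilities can always be obtained by brute-force enumeration of all $2^q$ events, which is handy for checking formulas for small $q$. A minimal R sketch follows (illustrative only, not the \texttt{mvbernoulli} implementation; it assumes $\mat{\theta}$ is stored in the $\vech$-compatible $\iota$ ordering from above).
+\begin{verbatim}
+# unnormalized Ising weights exp(vech(y y')' theta) for all 2^q events
+ising.table <- function(theta, q) {
+    events <- as.matrix(expand.grid(rep(list(0:1), q)))
+    w <- apply(events, 1, function(y) {
+        Y <- tcrossprod(y)                              # y y'
+        exp(sum(Y[lower.tri(Y, diag = TRUE)] * theta))  # vech ordering = iota ordering
+    })
+    cbind(events, prob = w / sum(w))
+}
+# example: P(Y_1 = 1 | Y_{-1} = 0) equals exp(theta[1]) / (1 + exp(theta[1]))
+q <- 3; theta <- rnorm(q * (q + 1) / 2)
+tab <- ising.table(theta, q)
+rest0 <- rowSums(tab[, 2:q, drop = FALSE]) == 0
+sum(tab[rest0 & tab[, 1] == 1, "prob"]) / sum(tab[rest0, "prob"])
+exp(theta[1]) / (1 + exp(theta[1]))     # same value
+\end{verbatim}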
+
+Two special cases are of interest. First, the single component case with $I = \{i\}$
+\begin{displaymath}
+    P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{y}_{-i})
+    = \frac{\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}}{1 + \exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )}
+\end{displaymath}
+and with $\mat{y}_{-i} = \mat{0}$ we get
+\begin{displaymath}
+    P_{\mat{\theta}}(Y_{i} = {y}_{i} | Y_{-i} = \mat{0})
+    = \frac{\exp( {\theta}_{\iota(i)} )^{y_i}}{1 + \exp( {\theta}_{\iota(i)} )}
+\end{displaymath}
+leading to
+\begin{displaymath}
+    \theta_{\iota(i)} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0 | Y_{-i} = \mat{0})} = \log\frac{P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}{1 - P_{\mat{\theta}}(Y_{i} = 1 | Y_{-i} = \mat{0})}
+\end{displaymath}
+
+The second case considers the conditional distribution of two components given all the rest is zero, meaning that $I = \{i, j\}$ and $\mat{y}_{-i,-j} = \mat{0}$, which has the form
+\begin{align*}
+    P_{\mat{\theta}}(Y_{i} = {y}_{i}, Y_{j} = {y}_{j} | Y_{-i,-j} = \mat{0})
+    &= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp( \t{\vech(\mat{y}\t{\mat{e}_i})}\mat{\theta} )^{y_i}
+    \exp( \t{\vech(\mat{y}\t{\mat{e}_j})}\mat{\theta} )^{y_j} \\
+    &= p_0(\mat{\theta} | Y_{-i,-j} = \mat{0})\exp(y_i\theta_{\iota(i)} + y_j\theta_{\iota(j)} + y_iy_j\theta_{\iota(i,j)})
+\end{align*}
+By considering all combinations of $y_i, y_j\in\{0, 1\}$ we get that
+\begin{align*}
+    \theta_{\iota(i, j)}
+    &= \log\frac{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 0 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 1 | Y_{-i,-j} = \mat{0})}{P_{\mat{\theta}}(Y_{i} = 0, Y_{j} = 1 | Y_{-i,-j} = \mat{0})P_{\mat{\theta}}(Y_{i} = 1, Y_{j} = 0 | Y_{-i,-j} = \mat{0})} \\
+    &= \log\frac{P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0})}{(1 - P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i,-j} = \mat{0}))}\frac{(1 - P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0}))}{P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0})P_{\mat{\theta}}(Y_j = 1 | Y_{-j} = \mat{0})}.
+\end{align*}
+Note that we have expressed all of the natural parameters $\mat{\theta}$ in terms of conditional probabilities. Either one component is $1$ and the rest are conditioned to be zero, or two components are $1$ and the rest are conditioned to be zero. This means that the set of those conditional probabilities fully determines $\mat{\theta}$. Let us denote the vector of those probabilities by $\mat{\pi}$, with the same index mapping as used in $\mat{\theta}$. More precisely, the $q(q + 1) / 2$ dimensional vector $\mat{\pi}$ is defined component-wise as
+\begin{align*}
+    {\pi}_{\iota(i)} = {\pi}(\mat{\theta})_{\iota(i)} &= P_{\mat{\theta}}(Y_i = 1 | Y_{-i} = \mat{0}) = \frac{\exp({\theta}_{\iota(i)})}{1 + \exp({\theta}_{\iota(i)})} \\
+    {\pi}_{\iota(i, j)} = {\pi}(\mat{\theta})_{\iota(i, j)} &= P_{\mat{\theta}}(Y_i = Y_j = 1 | Y_{-i, -j} = \mat{0}) \\
+    &= \frac{\exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}{1 + \exp(\theta_{\iota(i)}) + \exp(\theta_{\iota(j)}) + \exp(\theta_{\iota(i)} + \theta_{\iota(j)} + \theta_{\iota(i, j)})}
+\end{align*}
+and the component-wise inverse relation is given by
+\begin{equation}
+    \begin{aligned}[c]
+        \theta_{\iota(i)} = \theta(\mat{\pi})_{\iota(i)}
+        &= \log\frac{\pi_{\iota(i)}}{1 - \pi_{\iota(i)}} \\
+        \theta_{\iota(i, j)} = \theta(\mat{\pi})_{\iota(i, j)}
+        &= \log\frac{(1 - \pi_{\iota(i)}\pi_{\iota(j)})\pi_{\iota(i, j)}}{\pi_{\iota(i)}\pi_{\iota(j)}(1 - \pi_{\iota(i, j)})}.
+    \end{aligned}\label{eq:ising_theta_from_cond_prob}
+\end{equation}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Log-Likelihood, Score and Fisher Information}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+The log-likelihood for a given dataset $\mat{y}_i$, $i = 1, ..., n$ is
+\begin{displaymath}
+    l(\mat{\theta})
+    = \log\prod_{i = 1}^n P_{\mat{\theta}}(Y = \mat{y}_i)
+    = \log\prod_{i = 1}^n p_0(\mat{\theta})\exp( \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta} )
+    = n\log p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\mat{\theta}.
+\end{displaymath}
+For computing the Score we first get the differential
+\begin{align*}
+    \d l(\mat{\theta})
+    &= n p_0(\mat{\theta})^{-1} \d p_0(\mat{\theta}) + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta} \\
+    &= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + \sum_{i = 1}^n \t{\vech(\mat{y}_i\t{\mat{y}_i})}\d\mat{\theta}
+\end{align*}
+leading to the Score
+\begin{align}
+    \nabla_{\mat{\theta}} l
+    &= -n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) \nonumber \\
+    &= -n \E_{\mat{\theta}} \vech(Y\t{Y}) + \sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). \label{eq:ising_score}
+\end{align}
+The second differential of the log-likelihood is
+\begin{multline*}
+    \d^2 l(\mat{\theta})
+    = n p_0(\mat{\theta})^2 \d\t{\mat{\theta}} \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}}) \sum_{y\in\{0, 1\}^q} \exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} \\
+    - n p_0(\mat{\theta}) \sum_{y\in\{0, 1\}^q} \d\t{\mat{\theta}}\exp( \t{\vech(\mat{y}\t{\mat{y}})}\mat{\theta} )\vech(\mat{y}\t{\mat{y}})\t{\vech(\mat{y}\t{\mat{y}})}\d\mat{\theta} + 0
+\end{multline*}
+leading to the Hessian
+\begin{align*}
+    \nabla^2_{\mat{\theta}} l
+    &= n (\E_{\mat{\theta}}\vech(Y\t{Y}))\t{(\E_{\mat{\theta}}\vech(Y\t{Y}))} - n \E_{\mat{\theta}}\vech(Y\t{Y})\t{\vech(Y\t{Y})} \\
+    &= -n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
+\end{align*}
+From this the Fisher Information is directly given as
+\begin{displaymath}
+    \mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l
+    = n \cov_{\mat{\theta}}(\vech(Y\t{Y}), \vech(Y\t{Y})).
+\end{displaymath}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Estimation}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+For the estimation we use the Fisher scoring algorithm, an iterative algorithm with updating rule
+\begin{displaymath}
+    \mat{\theta}_{t+1} = \mat{\theta}_{t} + \mathcal{I}(\mat{\theta}_t)^{-1}\nabla_{\mat{\theta}} l(\mat{\theta}_t).
+\end{displaymath}
+The basic idea behind Fisher scoring is to consider a Taylor expansion of the score
+\begin{displaymath}
+    \nabla_{\mat{\theta}} l(\mat{\theta}^*) \approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\mat{\theta}^* - \mat{\theta}).
+\end{displaymath}
+Setting $\mat{\theta}^*$ to the MLE $\widehat{\mat{\theta}}$ and using $\nabla_{\mat{\theta}} l(\widehat{\mat{\theta}}) = 0$ we get
+\begin{align*}
+    0 &\approx \nabla_{\mat{\theta}}l(\mat{\theta}) + \nabla^2_{\mat{\theta}} l(\mat{\theta})(\widehat{\mat{\theta}} - \mat{\theta}) \\
+    \widehat{\mat{\theta}} &\approx \mat{\theta} - (\nabla^2_{\mat{\theta}} l(\mat{\theta}))^{-1}\nabla_{\mat{\theta}}l(\mat{\theta}).
+\end{align*}
+Now, replacing the observed information $\nabla^2_{\mat{\theta}} l(\mat{\theta})$ with the Fisher information $\mathcal{I}(\mat{\theta}) = -\E_{\mat{\theta}} \nabla^2_{\mat{\theta}} l(\mat{\theta})$ leads to
+\begin{displaymath}
+    \widehat{\mat{\theta}} \approx \mat{\theta} + \mathcal{I}(\mat{\theta})^{-1}\nabla_{\mat{\theta}}l(\mat{\theta})
+\end{displaymath}
+which is exactly the updating rule above.
+
+For an initial estimate $\mat{\theta}_0$ note that setting the Score \eqref{eq:ising_score} to zero at the MLE $\widehat{\mat{\theta}}$ gives
+\begin{displaymath}
+    \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y}) = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}).
+\end{displaymath}
+With $\E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})_{\iota(i, j)} = P_{\widehat{\mat{\theta}}}(Y_iY_j = 1)$ we may treat the marginal probabilities as being not that far from the conditional probabilities and set $\mat{\pi}_0 = \E_{\widehat{\mat{\theta}}} \vech(Y\t{Y})$ from which we can compute $\mat{\theta}_0 = \mat{\theta}(\mat{\pi}_0)$ as in \eqref{eq:ising_theta_from_cond_prob}.
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{The Ising Model with Covariates}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+Now assume additional covariates $X\in\mathbb{R}^p$ are given alongside the Multivariate Bernoulli variable $Y\in\{0, 1\}^q$ under the Ising model. Their relation is given by assuming that $Y$ follows the Ising model given $X$
+\begin{align*}
+    P_{\mat{\theta}_X}(Y = \mat{y} | X)
+    &= p_0(\mat{\theta}_X)\exp(\t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}_X) \\
+    &= p_0(\mat{\theta}_X)\exp(\t{\mat{y}}\mat{\Theta}_X\mat{y})
+\end{align*}
+with $\mat{\Theta}$ being a $q\times q$ symmetric matrix such that $\t{\mat{y}}\mat{\Theta}\mat{y} = \t{\vech(\mat{y} \t{\mat{y}})}\mat{\theta}$ for any $\mat{y}$. The explicit relation is
+\begin{align*}
+    \mat{\Theta} &= \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot\vech^{-1}(\mat{\theta}), \\
+    \mat{\theta} &= \vech((2\mat{1}_q\t{\mat{1}_q} - \mat{I}_q)\odot\mat{\Theta}).
+\end{align*}
+Assuming $X$ is centered, $\E X = 0$ (w.l.o.g.\ since we can always replace $X$ with $X - \E X$), and that the covariate dependent parameters $\mat{\Theta}_X$ relate to $X$ by
+\begin{displaymath}
+    \mat{\Theta}_X = \t{\mat{\alpha}}X\t{X}\mat{\alpha}
+\end{displaymath}
+for an unconstrained parameter matrix $\mat{\alpha}$ of dimensions $p\times q$ leads to the Ising model with covariates of the form
+\begin{displaymath}
+    P_{\mat{\alpha}}(Y = \mat{y} | X)
+    = p_0(\mat{\alpha}, X)\exp(\t{\mat{y}}\t{\mat{\alpha}}X\t{X}\mat{\alpha}\mat{y}).
+\end{displaymath} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Log-likelihood, Score and Fisher information} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +Give a single observation $(\mat{y}, \mat{x})$ the log-likelihood is +\begin{displaymath} + l(\mat{\alpha}) = \log P_{\mat{\alpha}}(Y = \mat{y} \mid X = \mat{x}) + = \log p_0(\mat{\alpha}, \mat{x}) + \t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}. +\end{displaymath} +Before we write the differential of the log-likelihood we take a look at the following +\begin{align*} + \d(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) + &= \d\tr(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\ + &= 2 \tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ + &= 2 \t{\vec(\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \\ + \d\log p_0(\mat{\alpha}, \mat{x}) + &= -p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\d\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) \\ + &= -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ + &= -2 \tr(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \\ + &= -2 \t{\vec(\mat{\alpha})}(\E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) +\end{align*} +Therefore, the differential of the log-likelihood is +\begin{displaymath} + \d l(\mat{\alpha}) + = 2 \tr((\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}])\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) +\end{displaymath} +or equivalently the Score +\begin{align} + \nabla_{\mat{\alpha}}l + &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y}\mid X = \mat{x}]) \nonumber \\ + &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q(\vech(\mat{y}\t{\mat{y}}) - \E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}])) \label{eq:ising_cond_score} \\ + &= 2 \mat{x}\t{\mat{x}}\mat{\alpha}\vec^{-1}(\mat{D}_q\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})) \nonumber +\end{align} +where $\nabla_{\mat{\theta}}l(\mat{\theta}_{\mat{\alpha}}(\mat{x}); \mat{y})$ is the Score \eqref{eq:ising_score} for a single observation and $\mat{D}_q$ is the dublication matrix. + +Now we continue with the second-order differential of the log-likelihood, therefore considure +\begin{align*} + \d^2(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y}) + &= 2 \tr(\mat{y}\t{\mat{y}}\t{(\d\mat{\alpha})}\mat{x}\t{\mat{x}}\d\mat{\alpha}) + 2\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d^2\mat{\alpha}) \\ + &= 2\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) + 0. 
+\end{align*} +The next term is +\begin{align*} + \d^2 \log p_0(\mat{\alpha}, \mat{x}) + &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\tr(\mat{y}\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\d\mat{\alpha}) \Big) \\ +\intertext{To shorten the expressions let $A_{\mat{y}} = (\mat{y}\t{\mat{y}}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}}$, then} + \ldots &= \d\Big( -2p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ + &= -2(\d p_0(\mat{\alpha}, \mat{x}))\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ + &\qquad -4 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}A_{\mat{y}}\t{A_{\mat{y}}}\vec(\d\mat{\alpha}) \Big) \\ + &\qquad\qquad -2 p_0(\mat{\alpha}, \mat{x})\sum_{y\in\{0, 1\}^q}\exp(\t{\mat{y}}\t{\mat{\alpha}}\mat{x}\t{\mat{x}}\mat{\alpha}\mat{y})\t{\vec(\d\mat{\alpha})}(\mat{y}\t{\mat{y}}\otimes\mat{x}\t{\mat{x}})\vec(\d\mat{\alpha}) \Big) \\ + &= 4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y} \mid X = \mat{x}]\E_{\mat{\alpha}}[\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ + &\qquad -4\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[A_{Y}\t{A_{Y}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ + &\qquad\qquad -2\t{\vec(\d\mat{\alpha})} \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}] \vec(\d\mat{\alpha}) \\ + &= -\t{\vec(\d\mat{\alpha})}( 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x}) + + 2 \E_{\mat{\alpha}}[Y\t{Y}\otimes \mat{x}\t{\mat{x}} \mid X = \mat{x}]) \vec(\d\mat{\alpha}) +\end{align*} +Back substituting to get the second order differential of the log-likelihood yields +\begin{displaymath} + \d^2 l(\mat{\alpha}) = \t{\vec(\d\mat{\alpha})}( + 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}} + - 4\cov_{\mat{\alpha}}(A_{Y} \mid X = \mat{x}) + ) \vec(\d\mat{\alpha}) +\end{displaymath} +leading to the Hessian of the log-likelihood +\begin{displaymath} + \nabla^2_{\vec{\mat{\alpha}}} l + = 2(\mat{y}\t{\mat{y}} - \E_{\mat{\alpha}}[Y\t{Y} \mid X = \mat{x}])\otimes \mat{x}\t{\mat{x}} + - 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}). +\end{displaymath} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Estimation} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +{\color{red}This needs to be figured out how to do this in a good way.} + +For the initial value we start by equating the Score \eqref{eq:ising_cond_score} to zero giving us +\begin{displaymath} + \widehat{\E}_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}] = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}) +\end{displaymath} +as an estimate of the marginal probabilities of the singe and two way interaction effects $\E_{\mat{\theta}_{\mat{\alpha}}(\mat{x})}[\vech(Y\t{Y})\mid X=\mat{x}]$. By assuming that the marginal probabilities are similar to the conditional probabilities $\mat{\pi}$ we simply equate them to get an initial estimate for the conditional probabilities +\begin{displaymath} + \widehat{\mat{\pi}}_0 = \frac{1}{n}\sum_{i = 1}^n \vech(\mat{y}_i\t{\mat{y}_i}). 
+\end{displaymath}
+Using the relation \eqref{eq:ising_theta_from_cond_prob} we compute an initial estimate for the natural parameters
+\begin{displaymath}
+    \widehat{\mat{\theta}}_0 = \widehat{\mat{\theta}}(\widehat{\mat{\pi}}_0)
+\end{displaymath}
+and convert it to the matrix version of the parameters
+\begin{displaymath}
+    \widehat{\mat{\Theta}}_0 = \tfrac{1}{2}(\mat{1}_q\t{\mat{1}_q} + \mat{I}_q)\odot \vech^{-1}(\widehat{\mat{\theta}}_0).
+\end{displaymath}
+Let $\widehat{\mat{\Sigma}}_0 = \frac{1}{n}\sum_{i = 1}^n \mat{x}_i\t{\mat{x}_i}$, then we require
+\begin{displaymath}
+    \widehat{\mat{\Theta}}_0 = \t{\widehat{\mat{\alpha}}_0}\widehat{\mat{\Sigma}}_0\widehat{\mat{\alpha}}_0.
+\end{displaymath}
+Next we define $m = \min(p, q)$ and take a rank $m$ approximation of both $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\Sigma}}_0$ via an SVD. These approximations have the form
+\begin{align*}
+    \widehat{\mat{\Theta}}_0 &\approx \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \\
+    \widehat{\mat{\Sigma}}_0 &\approx \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}
+\end{align*}
+where $\mat{U}_{\widehat{\mat{\Theta}}_0}$, $\mat{U}_{\widehat{\mat{\Sigma}}_0}$ are semi-orthogonal matrices of dimensions $q\times m$, $p\times m$, respectively. Both the diagonal matrices $\mat{D}_{\widehat{\mat{\Theta}}_0}$, $\mat{D}_{\widehat{\mat{\Sigma}}_0}$ have dimensions $m\times m$. Substituting the approximations into the above relation between $\widehat{\mat{\Theta}}_0$ and $\widehat{\mat{\alpha}}_0$ yields
+\begin{displaymath}
+    \mat{U}_{\widehat{\mat{\Theta}}_0} \mat{D}_{\widehat{\mat{\Theta}}_0} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}} \approx \t{\widehat{\mat{\alpha}}_0}\mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0} \t{\mat{U}_{\widehat{\mat{\Sigma}}_0}}\widehat{\mat{\alpha}}_0.
+\end{displaymath}
+Solving for $\widehat{\mat{\alpha}}_0$ leads to our initial value estimate
+\begin{displaymath}
+    \widehat{\mat{\alpha}}_0 = \mat{U}_{\widehat{\mat{\Sigma}}_0} \mat{D}_{\widehat{\mat{\Sigma}}_0}^{-1/2}\mat{D}_{\widehat{\mat{\Theta}}_0}^{1/2} \t{\mat{U}_{\widehat{\mat{\Theta}}_0}}.
+\end{displaymath}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newpage
+{\color{red}Some notes}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+The conditional Fisher information $\mathcal{I}_{Y\mid X = \mat{x}}$ is
+\begin{displaymath}
+    \mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})
+    = -\E_{\mat{\alpha}}[\nabla^2_{\vec{\mat{\alpha}}} l \mid X = \mat{x}]
+    = 4\cov_{\mat{\alpha}}((Y\t{Y}\otimes \mat{x}\t{\mat{x}})\vec{\mat{\alpha}} \mid X = \mat{x}).
+\end{displaymath}
+If we know that $X\sim f_X$ with a pdf (or pmf) $f_X$ for $X$, we get
+\begin{displaymath}
+    \mathcal{I}_{Y|X}(\vec\mat{\alpha}) = \int\mathcal{I}_{Y\mid X = \mat{x}}(\vec\mat{\alpha})f_X(\mat{x})\,\d\mat{x}
+    = 4\E_{X}\cov_{\mat{\alpha}}((Y\t{Y}\otimes X\t{X})\vec{\mat{\alpha}} \mid X).
+\end{displaymath}
+Moreover, from the factorization
+\begin{displaymath}
+    f_{\theta}(Y, X) = f_{\theta}(Y\mid X)f_{\theta}(X)
+\end{displaymath}
+it follows that
+\begin{displaymath}
+    l_{X, Y}(\theta) = \log f_{\theta}(Y, X)
+    = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
+    = l_{Y\mid X}(\theta) + l_{X}(\theta)
+\end{displaymath}
+% With the fisher information in general beeing defined (under any distribution) $\mathcal{I}(\theta) = \E_{\theta} \nabla l(\theta)$ with the classical Fisher information under the joint distribution of $X, Y$ to be
+% \begin{align*}
+%     \mathcal{I}_{X, Y}(\theta)
+%     &= \E_{X, Y}\nabla l_{X, Y}(\theta) \\
+%     &= \E_{X, Y}\nabla l_{Y\mid X}(\theta) + \E_{X, Y}\nabla l_{X}(\theta) \\
+%     &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X, Y}\nabla l_{X}(\theta) \\
+%     &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\E_{Y\mid X}[\nabla l_{X}(\theta)\mid X] \\
+%     &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] + \E_{X}\nabla l_{X}(\theta) \\
+%     &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta)
+% \end{align*}
+
+% What happens if the know the conditional distribution $Y\mid X\sim f_{\theta}$ only, meaning we not know the distribution of $X$. Then the log-likelihood for a data set $(y_i, x_i)$ with $i = 1, ..., n$ observations has the form
+% \begin{displaymath}
+%     l_{Y\mid X}(\theta) = \log\prod_{i = 1}^n f_{\theta}(Y = y_i \mid X = x_i)
+%     = \sum_{i = 1}^n \log f_{\theta}(Y = y_i \mid X = x_i)
+% \end{displaymath}
+% leading to
+% \begin{displaymath}
+%     \nabla l_{Y\mid X}(\theta) = \sum_{i = 1}^n \nabla \log f_{\theta}(Y = y_i \mid X = x_i)
+% \end{displaymath}
+
+\appendix
+\section{Notes on Fisher Information}
+Let $X\sim f_{\theta}$ be a random variable following a parameterized pdf (or pmf) $f_{\theta}$ with parameter vector $\theta$. Its log-likelihood (on the population, it is itself a random variable) is then
+\begin{displaymath}
+    l(\theta) = \log f_{\theta}(X)
+\end{displaymath}
+and the Score is defined as the derivative of the log-likelihood
+\begin{displaymath}
+    \nabla l(\theta) = \nabla\log f_{\theta}(X).
+\end{displaymath}
+The expectation of the Score is
+\begin{displaymath}
+    \E \nabla l(\theta)
+    = \int f_{\theta}(x)\nabla \log f_{\theta}(x)\,\d x
+    = \int \nabla f_{\theta}(x)\,\d x
+    = \nabla \int f_{\theta}(x)\,\d x
+    = \nabla 1
+    = 0.
+\end{displaymath}
+The Fisher information is defined as follows; due to the zero expectation of the Score it is identical to the covariance of the Score:
+\begin{displaymath}
+    \mathcal{I}(\theta) = \E \nabla l(\theta)\t{\nabla l(\theta)}.
+\end{displaymath}
+
+Now assume we have two random variables $X, Y$ and a parameter vector $\theta$, then the joint distribution relates to the conditional and the marginal distribution by
+\begin{displaymath}
+    f_{\theta}(X, Y) = f_{\theta}(Y\mid X)f_{\theta}(X)
+\end{displaymath}
+leading to the log-likelihood
+\begin{displaymath}
+    l_{X, Y}(\theta) = \log f_{\theta}(X, Y) = \log f_{\theta}(Y\mid X) + \log f_{\theta}(X)
+    = l_{Y\mid X}(\theta) + l_{X}(\theta).
+\end{displaymath}
+The Score decomposes identically due to the linearity of differentiation
+\begin{displaymath}
+    \nabla l_{X, Y}(\theta)
+    = \nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta)
+\end{displaymath}
+and for the Fisher information the same decomposition holds, although by a different argument, because
+\begin{align*}
+    \mathcal{I}_{X, Y}(\theta)
+    &= \E_{X, Y}\nabla l_{X, Y}(\theta)\t{\nabla l_{X, Y}(\theta)} \\
+    &= \E_{X, Y}(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))\t{(\nabla l_{Y\mid X}(\theta) + \nabla l_{X}(\theta))} \\
+    &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+     + \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)} \\
+    &\qquad + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+     + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
+    &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+     + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)}
+\end{align*}
+where the last equality is due to
+\begin{displaymath}
+    \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{X}(\theta)}
+    = \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X]\t{\nabla l_{X}(\theta)}
+    = 0
+\end{displaymath}
+using $\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\mid X] = 0$, the expectation of the Score. The other cross term vanishes by the same argument, and therefore we get
+\begin{align*}
+    \mathcal{I}_{X, Y}(\theta)
+    &= \E_{X, Y}\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)}
+     + \E_{X, Y}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
+    &= \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
+     + \E_{X}\nabla l_{X}(\theta)\t{\nabla l_{X}(\theta)} \\
+    &= \mathcal{I}_{Y\mid X}(\theta) + \mathcal{I}_{X}(\theta).
+\end{align*}
+Note that the conditional Fisher information has the form
+\begin{displaymath}
+    \mathcal{I}_{Y\mid X}(\theta)
+    = \E_{X}\E_{Y\mid X}[\nabla l_{Y\mid X}(\theta)\t{\nabla l_{Y\mid X}(\theta)} \mid X]
+    = \int \mathcal{I}_{Y\mid X = x}(\theta)f_X(x)\,\d x.
+\end{displaymath}
+Furthermore, in the case that the distribution of $X$ does not depend on $\theta$, meaning $f_{\theta}(X) = f(X)$, then $\mathcal{I}_X(\theta) = 0$ and $\mathcal{I}_{X, Y}(\theta) = \mathcal{I}_{Y \mid X}(\theta)$.
+
+\end{document}
diff --git a/LaTeX/main.bib b/LaTeX/main.bib
index f0d0dce..f5f984e 100644
--- a/LaTeX/main.bib
+++ b/LaTeX/main.bib
@@ -1,72 +1,96 @@
 @article{RegMatrixReg-ZhouLi2014,
-    author = {Zhou, Hua and Li, Lexin},
-    title = {Regularized matrix regression},
-    journal = {Journal of the Royal Statistical Society.
Series B (Statistical Methodology)}, + volume = {76}, + number = {2}, + pages = {463--483}, + year = {2014}, + publisher = {[Royal Statistical Society, Wiley]} } @book{StatInf-CasellaBerger2002, - title = {{Statistical Inference}}, - author = {Casella, George and Berger, Roger L.}, - isbn = {0-534-24312-6}, - series = {Duxbury Advanced Series}, - year = {2002}, - edition = {2}, - publisher = {Thomson Learning} + title = {{Statistical Inference}}, + author = {Casella, George and Berger, Roger L.}, + isbn = {0-534-24312-6}, + series = {Duxbury Advanced Series}, + year = {2002}, + edition = {2}, + publisher = {Thomson Learning} } @book{MatrixDiffCalc-MagnusNeudecker1999, - title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)}, - author = {Magnus, Jan R. and Neudecker, Heinz}, - year = {1999}, - publisher = {John Wiley \& Sons Ltd}, - isbn = {0-471-98632-1} + title = {Matrix Differential Calculus with Applications in Statistics and Econometrics (Revised Edition)}, + author = {Magnus, Jan R. and Neudecker, Heinz}, + year = {1999}, + publisher = {John Wiley \& Sons Ltd}, + isbn = {0-471-98632-1} } @book{MatrixAlgebra-AbadirMagnus2005, - title = {Matrix Algebra}, - author = {Abadir, Karim M. and Magnus, Jan R.}, - year = {2005}, - publisher = {Cambridge University Press}, - series = {Econometric Exercises}, - collection = {Econometric Exercises}, - place = {Cambridge}, - doi = {10.1017/CBO9780511810800} + title = {Matrix Algebra}, + author = {Abadir, Karim M. and Magnus, Jan R.}, + year = {2005}, + publisher = {Cambridge University Press}, + series = {Econometric Exercises}, + collection = {Econometric Exercises}, + place = {Cambridge}, + doi = {10.1017/CBO9780511810800} } @article{TensorDecomp-HuLeeWang2022, - author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, - title = {Generalized Tensor Decomposition With Features on Multiple Modes}, - journal = {Journal of Computational and Graphical Statistics}, - volume = {31}, - number = {1}, - pages = {204-218}, - year = {2022}, - publisher = {Taylor \& Francis}, - doi = {10.1080/10618600.2021.1978471}, + author = {Hu, Jiaxin and Lee, Chanwoo and Wang, Miaoyan}, + title = {Generalized Tensor Decomposition With Features on Multiple Modes}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {31}, + number = {1}, + pages = {204-218}, + year = {2022}, + publisher = {Taylor \& Francis}, + doi = {10.1080/10618600.2021.1978471}, } @article{CovarEstSparseKron-LengPan2018, - author = {Leng, Chenlei and Pan, Guangming}, - title = {{Covariance estimation via sparse Kronecker structures}}, - volume = {24}, - journal = {Bernoulli}, - number = {4B}, - publisher = {Bernoulli Society for Mathematical Statistics and Probability}, - pages = {3833 -- 3863}, - year = {2018}, - doi = {10.3150/17-BEJ980} + author = {Leng, Chenlei and Pan, Guangming}, + title = {{Covariance estimation via sparse Kronecker structures}}, + volume = {24}, + journal = {Bernoulli}, + number = {4B}, + publisher = {Bernoulli Society for Mathematical Statistics and Probability}, + pages = {3833 -- 3863}, + year = {2018}, + doi = {10.3150/17-BEJ980} } @article{sdr-PfeifferKaplaBura2021, - author = {Pfeiffer, Ruth and Kapla, Daniel and Bura, Efstathia}, - title = {{Least squares and maximum likelihood estimation of sufficient reductions in regressions with matrix-valued predictors}}, - volume = {11}, - year = {2021}, - journal = {International Journal of Data Science and Analytics}, - doi = {10.1007/s41060-020-00228-y} + 
author = {Pfeiffer, Ruth and Kapla, Daniel and Bura, Efstathia}, + title = {{Least squares and maximum likelihood estimation of sufficient reductions in regressions with matrix-valued predictors}}, + volume = {11}, + year = {2021}, + journal = {International Journal of Data Science and Analytics}, + doi = {10.1007/s41060-020-00228-y} +} + +@article{lsir-PfeifferForzaniBura, + author = {Pfeiffer, Ruth and Forzani, Liliana and Bura, Efstathia}, + year = {2012}, + month = {09}, + pages = {2414-27}, + title = {Sufficient dimension reduction for longitudinally measured predictors}, + volume = {31}, + journal = {Statistics in medicine}, + doi = {10.1002/sim.4437} +} + +@Inbook{ApproxKron-VanLoanPitsianis1993, + author = {Van Loan, C. F. and Pitsianis, N.}, + editor = {Moonen, Marc S. and Golub, Gene H. and De Moor, Bart L. R.}, + title = {Approximation with Kronecker Products}, + bookTitle = {Linear Algebra for Large Scale and Real-Time Applications}, + year = {1993}, + publisher = {Springer Netherlands}, + address = {Dordrecht}, + pages = {293--314}, + isbn = {978-94-015-8196-7}, + doi = {10.1007/978-94-015-8196-7_17} } diff --git a/LaTeX/main.tex b/LaTeX/main.tex index 45f0c29..b569869 100644 --- a/LaTeX/main.tex +++ b/LaTeX/main.tex @@ -9,19 +9,21 @@ \usepackage{makeidx} % Index (Symbols, Names, ...) \usepackage{xcolor, graphicx} % colors and including images \usepackage{tikz} +\usetikzlibrary{calc} \usepackage[ % backend=bibtex, style=authoryear-comp ]{biblatex} +\usepackage{algorithm, algpseudocode} % Pseudo Codes / Algorithms % Document meta into -\title{Derivation of Gradient Descent Algorithm for K-PIR} +\title{Higher Order Parametric Inverse Regression HO-PIR} \author{Daniel Kapla} -\date{November 24, 2021} +\date{\today} % Set PDF title, author and creator. 
\AtBeginDocument{ \hypersetup{ - pdftitle = {Derivation of Gradient Descent Algorithm for K-PIR}, + pdftitle = {Higher Order Parametric Inverse Regression HO-PIR}, pdfauthor = {Daniel Kapla}, pdfcreator = {\pdftexbanner} } @@ -54,28 +56,26 @@ \DeclareMathOperator{\kron}{\otimes} % Kronecker Product \DeclareMathOperator{\hada}{\odot} % Hadamard Product \newcommand{\ttm}[1][n]{\times_{#1}} % n-mode product (Tensor Times Matrix) -\DeclareMathOperator{\df}{\operatorname{df}} -\DeclareMathOperator{\tr}{\operatorname{tr}} +\DeclareMathOperator{\df}{df} +\DeclareMathOperator{\tr}{tr} \DeclareMathOperator{\var}{Var} \DeclareMathOperator{\cov}{Cov} +\DeclareMathOperator{\Span}{Span} \DeclareMathOperator{\E}{\operatorname{\mathbb{E}}} % \DeclareMathOperator{\independent}{{\bot\!\!\!\bot}} \DeclareMathOperator*{\argmin}{{arg\,min}} \DeclareMathOperator*{\argmax}{{arg\,max}} -\newcommand{\D}{\textnormal{D}} -\renewcommand{\d}{\textnormal{d}} -\renewcommand{\t}[1]{{#1^{\prime}}} -\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` -\newcommand{\todo}[1]{{\color{red}TODO: #1}} +\newcommand{\D}{\textnormal{D}} % derivative +\renewcommand{\d}{\textnormal{d}} % differential +\renewcommand{\t}[1]{{#1^{\prime}}} % matrix transpose +\newcommand{\pinv}[1]{{#1^{\dagger}}} % `Moore-Penrose pseudoinverse` -% \DeclareFontFamily{U}{mathx}{\hyphenchar\font45} -% \DeclareFontShape{U}{mathx}{m}{n}{ -% <5> <6> <7> <8> <9> <10> -% <10.95> <12> <14.4> <17.28> <20.74> <24.88> -% mathx10 -% }{} -% \DeclareSymbolFont{mathx}{U}{mathx}{m}{n} -% \DeclareMathSymbol{\bigtimes}{1}{mathx}{"91} +\newcommand{\todo}[1]{{\color{red}TODO: #1}} +\newcommand{\effie}[1]{{\color{blue}Effie: #1}} + +% Pseudo Code Commands +\newcommand{\algorithmicbreak}{\textbf{break}} +\newcommand{\Break}{\State \algorithmicbreak} \begin{document} @@ -111,9 +111,9 @@ Matrices and tensor can be \emph{vectorized} by the \emph{vectorization} operato The rank of a tensor $\ten{A}$ of dimensions $d_1\times ...\times d_r$ is given by a vector $\rank{\ten{A}} = (a_1, ..., a_r)\in[d_1]\times...\times[d_r]$ where $a_k = \rank(\ten{A}_{(k)})$ is the usual matrix rank of the $k$ unfolded tensor. -{\color{red}$\mathcal{S}^p$, $\mathcal{S}_{+}^p$, $\mathcal{S}_{++}^p$ symmetric matrices of dimensions $p\times p$, or call it $\operatorname{Sym}(p)$} +\todo{$\mathcal{S}^p$, $\mathcal{S}_{+}^p$, $\mathcal{S}_{++}^p$ symmetric matrices of dimensions $p\times p$, or call it $\operatorname{Sym}(p)$} -{\color{red}The group of orthogonas matrices $O(p)$ of dim $p\times p$, where $O(p, q)$ are the $p\times q$ matrices (a.k.a. the Stiefel manifold)} +\todo{The group of orthogonas matrices $O(p)$ of dim $p\times p$, where $O(p, q)$ are the $p\times q$ matrices (a.k.a. the Stiefel manifold)} \section{Tensor Normal Distribution} @@ -123,14 +123,13 @@ Let $\ten{X}$ be a multi-dimensional array random variable of order $r$ with dim \end{displaymath} Its density is given by \begin{displaymath} - f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p_{\lnot i}}} \Big)^{-1} + f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{p / p_i}} \Big)^{-1} \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) \end{displaymath} -where $p_{\lnot i} = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution +where $p = \prod_{i = 1}^r p_i$. 
This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution \begin{displaymath} - \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) + \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1). \end{displaymath} -with $p = \prod_{i = 1}^r p_i$. \begin{theorem}[Tensor Normal to Multi-Variate Normal equivalence] For a multi-dimensional random variable $\ten{X}$ of order $r$ with dimensions $p_1\times ..., p_r$. Let $\ten{\mu}$ be the mean of the same order and dimensions as $\ten{X}$ and the mode covariance matrices $\mat{\Delta}_i$ of dimensions $p_i\times p_i$ for $i = 1, ..., n$. Then the tensor normal distribution is equivalent to the multi-variate normal distribution by the relation @@ -152,12 +151,12 @@ with $p = \prod_{i = 1}^r p_i$. Next, using a property of the determinant of a Kronecker product $|\mat{\Delta}_1\otimes\mat{\Delta}_2| = |\mat{\Delta}_1|^{p_2}|\mat{\Delta}_2|^{p_1}$ yields \begin{displaymath} |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p_{\lnot 1}} + = |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_2|^{p_1}|\mat{\Delta}_1|^{p / p_1} \end{displaymath} - where $p_{\lnot i} = \prod_{j \neq i}p_j$. By induction over $r$ the relation + where $p = \prod_{j = 1}^r p_j$. By induction over $r$ the relation \begin{displaymath} |\mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1| - = \prod_{i = 1}^r |\mat{\Delta}_i|^{p_{\lnot i}} + = \prod_{i = 1}^r |\mat{\Delta}_i|^{p / p_i} \end{displaymath} holds for arbitrary order $r$. Substituting into the Tensor Normal density leads to \begin{align*} @@ -176,224 +175,851 @@ When sampling from the Multi-Array Normal one way is to sample from the Multi-Va where the sampling from the standard Multi-Array Normal is done by sampling all of the elements of $\ten{Z}$ from a standard Normal. -\section{Introduction} -\todo{rewrite this section to multi-variate arrays (tensors)} -We assume the model +% \section{Introduction} +% We assume the model +% \begin{displaymath} +% \ten{X} = \ten{\mu} + \ten{F} \times \{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon} +% \end{displaymath} +% where the dimensions of all the components are listed in Table~\ref{tab:dimensions}. +% % and its vectorized form +% % \begin{displaymath} +% % \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon} +% % \end{displaymath} + +% \begin{table}[!htp] +% \centering +% \begin{minipage}{0.8\textwidth} +% \centering +% \begin{tabular}{l l l} +% $\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p_1\times p_2$ \\ +% $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r$ & {\small\color{gray} (Population Level)} \\ +% $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r\times n$ & {\small\color{gray} (Sample Level)} \\ +% $\mat{f}_y$ & $q_1\times q_2$ \\ +% $\ten{F}$ & $q_1\times ...\times q_r$ & {\small\color{gray} (Population Level)} \\ +% $\ten{F}$ & $q_1\times ...\times q_r\times n$ & {\small\color{gray} (Sample Level)} \\ +% $\mat{\alpha}_j$ & $p_j\times q_j$ & $j = 1, ..., r$ \\ +% $\mat\Delta_j$ & $q\times q$ & $j = 1, ..., r$ +% \end{tabular} +% \caption{\label{tab:dimensions}Summary listing of dimensions.} +% \end{minipage} +% \end{table} + +% The log-likelihood $l$ given $n$ i.i.d. 
observations assuming that $\mat{X}_i\mid(Y = y_i)$ is normal distributed as +% \begin{displaymath} +% \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \Delta) +% \end{displaymath} +% Replacing all unknown by there estimates gives the (estimated) log-likelihood +% \begin{equation}\label{eq:log-likelihood-est} +% \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i +% \end{equation} +% where the residuals are +% \begin{displaymath} +% \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1) +% \end{displaymath} +% and the MLE estimate assuming $\mat\alpha, \mat\beta$ known for the covariance matrix $\widehat{\mat\Delta}$ as solution to the score equations is +% \begin{equation}\label{eq:Delta} +% \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q). +% \end{equation} +% Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradient for $\mat\alpha$ and $\mat\beta$ of $\hat{l}$ used to formulate a Gradient Descent base estimation algorithm for $\mat\alpha, \mat\beta$ as the previous algorithmic. The main reason is to enable an estimation for bigger dimensions of the $\mat\alpha, \mat\beta$ coefficients since the previous algorithm does \emph{not} solve the high run time problem for bigger dimensions. + +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% %%% Derivative %%% +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \section{Derivative of the Log-Likelihood} +% Start with the general case of $\mat X_i|(Y_i = y_i)$ is multivariate normal distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} an further assumptions. We have $i = 1, ..., n$ observations following +% \begin{displaymath} +% \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta). +% \end{displaymath} +% The MLE estimates of $\mat\mu, \mat\Delta$ are +% \begin{displaymath} +% \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)}, +% \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}. +% \end{displaymath} +% Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as +% \begin{displaymath} +% \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i. +% \end{displaymath} +% We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$. 
+% \begin{align} +% \d\hat{l}(\mat\alpha, \mat\beta) +% &= -\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\ +% &= \underbrace{-\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ beeing the MLE}} \label{eq:deriv1} +% - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i. +% \end{align} +% The next step is to compute $\d \mat{r}_i$ which depends on both $\mat\alpha$ and $\mat\beta$ +% \begin{align*} +% \d\mat{r}_i(\mat\alpha, \mat\beta) +% &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\ +% &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\ +% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\ +% \intertext{using the identity \ref{eq:vecKron}, to obtain vectorized differentials, gives} +% \dots +% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\ +% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\ +% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\ +% &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big) +% \end{align*} +% Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix}) +% \begin{align*} +% \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= +% \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta), +% {\color{gray}\qquad(q\times r)} \\ +% \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= +% \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}). +% {\color{gray}\qquad(p\times k)} +% \end{align*} +% These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. By defining the following compact matrix +% \begin{equation}\label{eq:permTransResponse} +% \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{j = 1}^n \vec\mat{f}_{y_j}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_j \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)} +% \end{equation} +% with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. 
As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically}
+% \begin{align*}
+%     \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &=
+%         \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}),
+%         {\color{gray}\qquad(q\times r)} \\
+%     \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &=
+%         \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}).
+%         {\color{gray}\qquad(p\times k)}
+% \end{align*}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Kronecker Covariance Structure Model}\label{sec:kron_cov}
+% \section{Kronecker Covariance Structure}\label{sec:kron_cov}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+Let $\ten{X}, \ten{F}$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. We assume the population model
+\begin{align*}
+    \ten{X} &= \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon} \\
+            &= \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon}
+\end{align*}
+where the error tensor $\ten{\epsilon}$ is of the same order and dimensions as $\ten{X}$. The error tensor is assumed to be mean zero tensor normal distributed, $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$, with symmetric, positive definite covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$.
+
+Given $i = 1, ..., n$ i.i.d. observations $(\ten{X}_i, \ten{F}_i)$, the sample model analog is
+\begin{align}
+    \ten{X} &= \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} \nonumber \\
+            &= \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon} \label{eq:sample_model}
+\end{align}
+where the model tensors $\ten{X}, \ten{F}$ collect all observations on an additional sample axis in the last mode, making them tensors of order $r + 1$. That is, $\ten{X}$, $\ten{\mu}$ and $\ten{\epsilon}$ have dimensions $p_1\times ...\times p_r\times n$ and $\ten{F}$ has dimensions $q_1\times ...\times q_r\times n$. The mean tensor $\ten{\mu}$ replicates the population mean along the sample axis, $\ten{\mu}_i = \ten{\mu}_1$, $i = 1, ..., n$. Let the estimated residual tensor be
 \begin{displaymath}
-    \mat{X} = \mat{\mu} + \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}
-\end{displaymath}
-where the dimensions of all the components are listed in Table~\ref{tab:dimensions}.
-and its vectorized form
-\begin{displaymath}
-    \vec\mat{X} = \vec\mat{\mu} + (\mat{\alpha}\kron\mat{\beta})\vec\mat{f}_y + \vec\mat{\epsilon}
+    \ten{R} = \ten{X} - \ten{\mu} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j.
 \end{displaymath}
+In the following we assume w.l.o.g. that the mean tensor $\ten{\mu} = 0$.
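For concreteness, the sample model \eqref{eq:sample_model} with $\ten{\mu} = 0$ and the residual tensor $\ten{R}$ can be written out in a few lines of base R. The mode product is implemented directly through unfoldings so that the sketch is self-contained; the helper names (unfold, mode_prod), the chosen dimensions and the evaluation at the true parameters are purely illustrative and are not the interface of the accompanying package.

# Mode-j unfolding: rows index mode j, columns the remaining modes (in increasing mode order).
unfold <- function(A, j) {
  d <- dim(A)
  matrix(aperm(A, c(j, seq_along(d)[-j])), nrow = d[j])
}

# Mode-j product A x_j B of a tensor A with a matrix B, requires ncol(B) == dim(A)[j].
mode_prod <- function(A, B, j) {
  d <- dim(A)
  perm <- c(j, seq_along(d)[-j])
  C <- B %*% unfold(A, j)                   # multiply along mode j
  d[j] <- nrow(B)
  aperm(array(C, d[perm]), order(perm))     # refold and undo the permutation
}

# Sample model X = F x_1 alpha_1 x_2 alpha_2 + eps for r = 2 with the n
# observations stacked along the last (sample) mode.
set.seed(1)
p <- c(4, 3); q <- c(2, 2); n <- 100
alphas <- Map(function(pj, qj) matrix(rnorm(pj * qj), pj, qj), p, q)
Fy  <- array(rnorm(prod(q) * n), dim = c(q, n))     # predictor tensor (q1 x q2 x n)
eps <- array(rnorm(prod(p) * n), dim = c(p, n))     # error tensor     (p1 x p2 x n)
X <- mode_prod(mode_prod(Fy, alphas[[1]], 1), alphas[[2]], 2) + eps
# residual tensor R evaluated at the true parameters
R <- X - mode_prod(mode_prod(Fy, alphas[[1]], 1), alphas[[2]], 2)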
+ \begin{table}[!htp] \centering \begin{minipage}{0.8\textwidth} \centering - \begin{tabular}{l l} - $\mat X, \mat\mu, \mat R, \mat\epsilon$ & $p\times q$ \\ - $\mat{f}_y$ & $k\times r$ \\ - $\mat\alpha$ & $q\times r$ \\ - $\mat\beta$ & $p\times k$ \\ - $\mat\Delta$ & $p q\times p q$ \\ - $\mat\Delta_1$ & $q\times q$ \\ - $\mat\Delta_2$ & $p\times p$ \\ - $\mat{r}$ & $p q\times 1$ \\ - \hline - $\ten{X}, \ten{R}$ & $n\times p\times q$ \\ - $\ten{F}$ & $n\times k\times r$ \\ + \begin{tabular}{l l l} + $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r$ & {\small\color{gray} (Population Level)} \\ + $\ten X, \ten\mu, \ten R, \ten\epsilon$ & $p_1\times ...\times p_r\times n$ & {\small\color{gray} (Sample Level)} \\ + $\ten{F}$ & $q_1\times ...\times q_r$ & {\small\color{gray} (Population Level)} \\ + $\ten{F}$ & $q_1\times ...\times q_r\times n$ & {\small\color{gray} (Sample Level)} \\ + $\mat{\alpha}_j$ & $p_j\times q_j$ & $j = 1, ..., r$ \\ + $\mat\Delta_j$ & $q\times q$ & $j = 1, ..., r$ \end{tabular} - \caption{\label{tab:dimensions}\small Summary listing of dimensions with the corresponding sample versions $\mat{X}_i, \mat{R}_i, \mat{r}_i, \mat{f}_{y_i}$ for $i = 1, ..., n$ as well as estimates $\widehat{\mat{\alpha}}, \widehat{\mat{\beta}}, \widehat{\mat\Delta}, \widehat{\mat\Delta}_1$ and $\widehat{\mat\Delta}_2$.} + \caption{\label{tab:dimensions}Summary listing of dimensions.} \end{minipage} \end{table} -The log-likelihood $l$ given $n$ i.i.d. observations assuming that $\mat{X}_i\mid(Y = y_i)$ is normal distributed as -\begin{displaymath} - \vec\mat{X}_i \sim \mathcal{N}_{p q}(\vec\mat\mu + (\mat\alpha\kron\mat\beta)\vec\mat{f}_{y_i}, \Delta) -\end{displaymath} -Replacing all unknown by there estimates gives the (estimated) log-likelihood -\begin{equation}\label{eq:log-likelihood-est} - \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i -\end{equation} -where the residuals are -\begin{displaymath} - \mat{r}_i = \vec\mat{X}_i - \vec\overline{\mat{X}} - (\mat\alpha\kron\mat\beta)\vec{\mat f}_{y_i}\qquad (p q \times 1) -\end{displaymath} -and the MLE estimate assuming $\mat\alpha, \mat\beta$ known for the covariance matrix $\widehat{\mat\Delta}$ as solution to the score equations is -\begin{equation}\label{eq:Delta} - \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat{r}_i \t{\mat{r}_i} \qquad(p q \times p q). -\end{equation} -Note that the log-likelihood estimate $\hat{l}$ only depends on $\mat\alpha, \mat\beta$. Next, we compute the gradient for $\mat\alpha$ and $\mat\beta$ of $\hat{l}$ used to formulate a Gradient Descent base estimation algorithm for $\mat\alpha, \mat\beta$ as the previous algorithmic. The main reason is to enable an estimation for bigger dimensions of the $\mat\alpha, \mat\beta$ coefficients since the previous algorithm does \emph{not} solve the high run time problem for bigger dimensions. 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Least Squares estimates}\label{sec:ls} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Derivative %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Derivative of the Log-Likelihood} -Start with the general case of $\mat X_i|(Y_i = y_i)$ is multivariate normal distributed with the covariance $\mat\Delta$ being a $p q\times p q$ positive definite symmetric matrix \emph{without} an further assumptions. We have $i = 1, ..., n$ observations following +The least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ given $n$ i.i.d. observations $(\ten{X}_i, \ten{F}_i)$ are the solution to the minimization problem \begin{displaymath} - \mat{r}_i = \vec(\mat X_i - \mat\mu - \mat\beta\mat{f}_{y_i}\t{\mat\alpha}) \sim \mathcal{N}_{p q}(\mat 0, \mat\Delta). + \widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r + = \argmin_{\mat{\alpha}_1, ..., \mat{\alpha}_r} \| \ten{X} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j \|_F^2. \end{displaymath} -The MLE estimates of $\mat\mu, \mat\Delta$ are -\begin{displaymath} - \widehat{\mat\mu} = \overline{\mat X} = \frac{1}{n}\sum_{i = 1}^n \mat X_i {\color{gray}\qquad(p\times q)}, - \qquad \widehat{\mat\Delta} = \frac{1}{n}\sum_{i = 1}^n \mat r_i\t{\mat r_i} {\color{gray}\qquad(p q\times p q)}. -\end{displaymath} -Substitution of the MLE estimates into the log-likelihood $l(\mat\mu, \mat\Delta, \mat\alpha, \mat\beta)$ gives the estimated log-likelihood $\hat{l}(\mat\alpha, \mat\beta)$ as -\begin{displaymath} - \hat{l}(\mat\alpha, \mat\beta) = -\frac{n q p}{2}\log 2\pi - \frac{n}{2}\log|\widehat{\mat\Delta}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat\Delta}^{-1}\mat{r}_i. -\end{displaymath} -We are interested in the gradients $\nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta)$, $\nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta)$ of the estimated log-likelihood. Therefore, we consider the differential of $\hat{l}$. -\begin{align} - \d\hat{l}(\mat\alpha, \mat\beta) - &= -\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \big(\t{(\d \mat{r}_i)}\widehat{\mat{\Delta}}^{-1} \mat{r}_i + \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i + \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i\big) \nonumber\\ - &= \underbrace{-\frac{n}{2}\log|\widehat{\mat{\Delta}}| - \frac{1}{2}\sum_{i = 1}^n \t{\mat{r}_i}(\d\widehat{\mat{\Delta}}^{-1}) \mat{r}_i}_{=\,0\text{ due to }\widehat{\mat{\Delta}}\text{ beeing the MLE}} \label{eq:deriv1} - - \sum_{i = 1}^n \t{\mat{r}_i}\widehat{\mat{\Delta}}^{-1} \d \mat{r}_i. 
-\end{align} -The next step is to compute $\d \mat{r}_i$ which depends on both $\mat\alpha$ and $\mat\beta$ +With the identities $\|\ten{A}\|_F^2 = \tr(\ten{A}_{(j)}\t{\ten{A}_{(j)}})$ and $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ for any $j$ it followds that the differential of the Frobenius norm is equal to \begin{align*} - \d\mat{r}_i(\mat\alpha, \mat\beta) - &= -\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \\ - &= -\vec\!\big( \mat{I}_{p q}\,\d(\mat\alpha\kron \mat\beta)\vec\mat{f}_{y_i} \big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})\,\d\vec(\mat\alpha\kron \mat\beta) \\ -\intertext{using the identity \ref{eq:vecKron}, to obtain vectorized differentials, gives} - \dots - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \,\d(\vec \mat\alpha\kron \vec \mat\beta) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\d\vec \mat\alpha)\kron \vec \mat\beta + \vec \mat\alpha\kron (\d\vec \mat\beta)\big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big(\mat{I}_{r q}(\d\vec \mat\alpha)\kron (\vec \mat\beta)\mat{I}_1 + (\vec \mat\alpha)\mat{I}_1\kron \mat{I}_{k p}(\d\vec \mat\beta)\big) \\ - &= -(\t{\vec(\mat{f}_{y_i})}\kron \mat{I}_{p q})(\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) \big((\mat{I}_{r q}\kron\vec \mat\beta)\d\vec \mat\alpha + (\vec \mat\alpha\kron \mat{I}_{k p})\d\vec \mat\beta\big) + \d \| \ten{X} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j \|_F^2 + &= -2 \sum_{j = 1}^k \tr\Big( (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k\times_j\d\mat{\alpha}_j)_{(j)}\t{(\ten{X} - \ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}} \Big) \\ + &= -2 \sum_{j = 1}^k \tr\Big( (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{X} - \ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}}\d\mat{\alpha}_j \Big). \end{align*} -Now, substitution of $\d\mat{r}_i$ into \eqref{eq:deriv1} gives the gradients (not dimension standardized versions of $\D\hat{l}(\mat\alpha)$, $\D\hat{l}(\mat\beta)$) by identification of the derivatives from the differentials (see: \todo{appendix}) -\begin{align*} - \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= - \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\mat{I}_{r q}\kron\vec \mat\beta), - {\color{gray}\qquad(q\times r)} \\ - \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= - \sum_{i = 1}^n (\t{\vec(\mat{f}_{y_i})}\kron\t{\mat{r}_i}\widehat{\mat\Delta}^{-1}) (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p) (\vec \mat\alpha\kron \mat{I}_{k p}). - {\color{gray}\qquad(p\times k)} -\end{align*} -These quantities are very verbose as well as completely unusable for an implementation. By detailed analysis of the gradients we see that the main parts are only element permutations with a high sparsity. 
By defining the following compact matrix -\begin{equation}\label{eq:permTransResponse} - \mat G = \vec^{-1}_{q r}\bigg(\Big( \sum_{j = 1}^n \vec\mat{f}_{y_j}\otimes \widehat{\mat\Delta}^{-1}\mat{r}_j \Big)_{\pi(i)}\bigg)_{i = 1}^{p q k r}{\color{gray}\qquad(q r \times p k)} +Equating to zero for each differential leads to normal equations for the $\mat{\alpha}_j$ parameter matrices +\begin{displaymath} + (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{\ten{X}_{(j)}} = (\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)}}, \qquad j = 1, ..., r +\end{displaymath} +where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. Solving for $\mat{\alpha}_j$ gives a system of equations +\begin{equation}\label{eq:ls_est_alphas} + \widehat{\mat{\alpha}}_j = \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1}, \qquad j = 1, ..., r. \end{equation} -with $\pi$ being a permutation of $p q k r$ elements corresponding to permuting the axis of a 4D tensor of dimensions $p\times q\times k\times r$ by $(2, 4, 1, 3)$. As a generalization of transposition this leads to a rearrangement of the elements corresponding to the permuted 4D tensor with dimensions $q\times r\times p\times k$ which is then vectorized and reshaped into a matrix of dimensions $q r \times p k$. With $\mat G$ the gradients simplify to \todo{validate this mathematically} -\begin{align*} - \nabla_{\mat\alpha}\hat{l}(\mat\alpha, \mat\beta) &= - \vec_{q}^{-1}(\mat{G}\vec{\mat\beta}), - {\color{gray}\qquad(q\times r)} \\ - \nabla_{\mat\beta}\hat{l}(\mat\alpha, \mat\beta) &= - \vec_{p}^{-1}(\t{\mat{G}}\vec{\mat\alpha}). - {\color{gray}\qquad(p\times k)} -\end{align*} + +\begin{example}[Vector Valued LS ($r = 1$)]\label{ex:ls_vector_case} + Considering the vector valued case ($r = 1$), then the sample tensors $\ten{F} = \ten{F}_{(1)} = \t{\mat{F}}$ and $\ten{X} = \ten{X}_{(1)} = \t{\mat{X}}$ which are both matrices of dimensions $n\times p$ and $n\times q$, respectively. The LS estimate for the single parameter matrix $\mat{\alpha} = \mat{\alpha}_1$ is + \begin{displaymath} + \widehat{\mat{\alpha}} + = \ten{X}_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(1)}}]^{-1} + = \ten{X}_{(1)}\t{\ten{F}_{(1)}}(\ten{F}_{(1)}\t{\ten{F}_{(1)}})^{-1} + = \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1}. + \end{displaymath} +\end{example} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Kronecker Covariance Structure %%% +\subsection{MLE estimates}\label{sec:mle} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Kronecker Covariance Structure} -As before we let the sample model for tensor valued opbservations and responses or oder $r$ be -\begin{displaymath} - \ten{X} = \ten{\mu} + \ten{F}\times_{j\in[r]}\mat{\alpha}_j + \ten{\epsilon} -\end{displaymath} -but the error tensor $\ten{\epsilon}\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ is Tensor Normal distributed with mean zero and covariance matrices $\mat{\Delta}_1, ..., \mat{\Delta}_r$. 
-The sample model for $n$ observations has the same form with an additional sample axis in the last mode of $\ten{X}$ and $\ten{Y}$ with dimensions $p_1\times ...\times p_r\times n$ and $q_1\times ...\times q_r\times n$, respectively. Let the residual tensor be -\begin{displaymath} - \ten{R} = \ten{X} - \ten{\mu} - \ten{F}\times_{j\in[r]}\mat{\alpha}_j. -\end{displaymath} - -By the definition of the Tensor Normal, using the notation $p_{\lnot j} = \prod_{k\neq j}p_j$, we get for observations $\ten{X}, \ten{F}$ the log-likelihood in terms of the residuals as +By the definition of the Tensor Normal, with $p = \prod_{j = 1}^r p_j$, we get for $n$ i.i.d. observations $\ten{X}_i, \ten{F}_i$ the log-likelihood in terms of the residuals as \begin{displaymath} l = -\frac{n p}{2}\log 2\pi - -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\log|\mat{\Delta}_j| + -\sum_{j = 1}^r \frac{n p}{2 p_j}\log|\mat{\Delta}_j| -\frac{1}{2}\langle \ten{R}\times_{j\in[r]}\mat{\Delta}_j^{-1}, \ten{R} \rangle. \end{displaymath} -Note that the log-likelihood depends on the covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$ as well as the mean $\mu$ and the parameter matrices $\mat{\alpha}_j$, $j = 1, ..., r$ through the residuals $\ten{R}$. +Note that the log-likelihood depends not only on the covariance matrices $\mat{\Delta}_j$, $j = 1, ..., r$ but also on the parameter matrices $\mat{\alpha}_j$, $j = 1, ..., r$ through the residuals $\ten{R}$ (mean $\mu = 0$ is assumed). -\subsection{MLE estimates} For deriving the MLE estimates we compute the differential of the log-likelihood given the data as \begin{displaymath} \d l = - -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\d\log|{\mat{\Delta}}_j| + -\sum_{j = 1}^r \frac{n p}{2 p_j}\d\log|{\mat{\Delta}}_j| -\frac{1}{2}\sum_{j = 1}^r\langle {\ten{R}}\times_{k\in[r]\backslash j}{\mat{\Delta}}_k^{-1}\times_j\d{\mat{\Delta}}^{-1}_j, \ten{R} \rangle -\langle {\ten{R}}\times_{j\in[r]}{\mat{\Delta}}_j^{-1}, \d{\ten{R}} \rangle. 
\end{displaymath} -Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the estimated log-likelihood as +Using $\d\log|\mat{A}| = \tr(\mat{A}^{-1}\d\mat{A})$ and $\d\mat{A}^{-1} = -\mat{A}^{-1}(\d\mat{A})\mat{A}^{-1}$ as well as $\langle\ten{A}, \ten{B}\rangle = \tr(\ten{A}_{(j)}\t{\ten{B}_{(j)}})$ for any $j = 1, ..., r$ we get the differential of the log-likelihood as \begin{align*} - \d \hat{l} + \d l &= - -\sum_{j = 1}^r \frac{n p_{\lnot j}}{2}\tr(\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j) - -\frac{1}{2}\sum_{j = 1}^r\tr\!\Big((\d{\widehat{\mat{\Delta}}}^{-1}_j)({\ten{R}}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}\Big) - -\langle {\ten{R}}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d{\ten{R}} \rangle \\ + -\sum_{j = 1}^r \frac{n p}{2 p_j}\tr({\mat{\Delta}}_j^{-1}\d{\mat{\Delta}}_j) + -\frac{1}{2}\sum_{j = 1}^r\tr\!\Big((\d{{\mat{\Delta}}}^{-1}_j)({\ten{R}}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}}\Big) + -\langle {\ten{R}}\times_{j\in[r]}{{\mat{\Delta}}}_j^{-1}, \d{\ten{R}} \rangle \\ &= -\frac{1}{2}\sum_{j = 1}^r \tr\left(\Big( - n p_{\lnot j}\mat{I}_{p_j} - \widehat{\mat{\Delta}}_j^{-1}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} - \right)\widehat{\mat{\Delta}}_j^{-1}\d\widehat{\mat{\Delta}}_j\Big) - -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle. + \frac{n p}{p_j}\mat{I}_{p_j} - {\mat{\Delta}}_j^{-1}(\ten{R}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} + \right){\mat{\Delta}}_j^{-1}\d{\mat{\Delta}}_j\Big) + -\langle \ten{R}\times_{j\in[r]}{{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle. \end{align*} -With $\ten{R}$ not dependent on the $\widehat{\mat{\Delta}}_j$'s we get the covariance MLE estimates +With $\ten{R}$ not dependent on the $\mat{\Delta}_j$'s we get the covariance MLE estimates +\begin{equation}\label{eq:mle_est_Deltas} + \widehat{\mat{\Delta}}_j = \frac{p_j}{n p}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} + \qquad{\color{gray}p_j\times p_j}, \qquad j = 1, ..., r. +\end{equation} +At the same time with the same argument the gradients with respect to the covariance matrices are given by \begin{align*} - \widehat{\mat{\Delta}}_j &= \frac{1}{n p_{\lnot j}}(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{\ten{R}_{(j)}} - \qquad{\color{gray}p_j\times p_j} && j = 1, ..., r. 
-\end{align*} -as well as gradients (even though they are not realy used, except in the case of a pure gradient based estimation procedure which might ease the estimation burden as all the MLE estimates are cross dependent) -\begin{align*} - \nabla_{\widehat{\mat{\Delta}}_j}\hat{l} &= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big( - \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\widehat{\mat{\Delta}}_j^{-1} - n p_{\lnot j}\mat{I}_{p_j} + \nabla_{{\mat{\Delta}}_j}l &= \frac{1}{2}{\mat{\Delta}}_j^{-1}\big( + \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]\backslash j}{{\mat{\Delta}}}_k^{-1})_{(j)}}{\mat{\Delta}}_j^{-1} - \frac{n p}{p_j}\mat{I}_{p_j} \big) \\ - &= \frac{1}{2}\widehat{\mat{\Delta}}_j^{-1}\big( - \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}} - n p_{\lnot j}\mat{I}_{p_j} + &= \frac{1}{2}{\mat{\Delta}}_j^{-1}\big( + \ten{R}_{(j)}\t{(\ten{R}\times_{k\in[r]}{{\mat{\Delta}}}_k^{-1})_{(j)}} - \frac{n p}{p_j}\mat{I}_{p_j} \big) \qquad{\color{gray}p_j\times p_j} && j = 1, ..., r. \end{align*} -We continue by substitution of the covariance estimates and get +The remaining part is \begin{align*} - \d\hat{l} &= -\langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle \\ - &= \sum_{j = 1}^r \langle \ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\ - &= \sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big). + \langle \ten{R}\times_{j\in[r]}{\widehat{\mat{\Delta}}}_j^{-1}, \d\ten{R} \rangle + &= -\sum_{j = 1}^r \langle \ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1}, \ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k\times_j\d\widehat{\mat{\alpha}}_j \rangle \\ + &= -\sum_{j = 1}^r \tr\big( (\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}}\d\widehat{\mat{\alpha}}_j \big). \end{align*} Through that the gradient for all the parameter matrices is \begin{align*} - \nabla_{\widehat{\mat{\alpha}}_j}\hat{l} &= (\ten{R}\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} + \nabla_{\mat{\alpha}_j}l &= (\ten{R}\times_{k\in[r]}{{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}{\mat{\alpha}}_k)_{(j)}} \qquad{\color{gray}p_j\times q_j} && j = 1, ..., r. 
\end{align*} -By equating the gradients to zero and expanding $\ten{R}$ we get a system of normal equations +By equating the gradients to zero and expanding $\ten{R}$ we get a system of equations \begin{gather*} - 0\overset{!}{=} \nabla_{\widehat{\mat{\alpha}}_j}\hat{l} - = ((\ten{X}-\widehat{\ten{\mu}}-\ten{F}\times_{l\in[r]}\widehat{\mat{\alpha}}_l)\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} \\ + 0\overset{!}{=} \nabla_{\mat{\alpha}_j} l + = ((\ten{X}-\ten{F}\times_{l\in[r]}\mat{\alpha}_l)\times_{k\in[r]}{\mat{\Delta}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} \\ % - ((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} = (\ten{F}\times_{k\in[r]}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}} + (\ten{X}\times_{k\in[r]}{\mat{\Delta}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = (\ten{F}\times_{k\in[r]}\mat{\Delta}_k^{-1}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}. \end{gather*} -with cross dependent solutions for the MLE estimates $\widehat{\mat{\alpha}}$ given by -\begin{displaymath} - \widehat{\mat{\alpha}}_j = \widehat{\mat{\Delta}}_j((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[r]}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1}. -\end{displaymath} +Solving for $\mat{\alpha}_j$ in conjunction with the MLE estimates for the $\mat{\Delta}_j$'s gives a cross dependent system of equations for the $\mat{\alpha}_j$ MLE estimates +\begin{equation} + \widehat{\mat{\alpha}}_j = (\ten{X}\times_{k\in[r]\backslash j}{\widehat{\mat{\Delta}}}_k^{-1})_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \label{eq:mle_est_alphas} +\end{equation} for $j = 1, ..., r$. -\begin{example}[Simple multivariate case ($r = 1$)] - Lets consider the case of $r = 1$, this is the usual multivariate regression case for vector valued predictors. Here $\mat{F}$ denotes the $n\times q$ centered model matrix and the responses are collected in the $n\times p$ matrix $\mat{X}$ with the $n$ observations in the rows and the variables by columns. Furthermore, let the residual estimate matrix $\mat{R} = \mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}$. In this case the tensor MLE estimates simplify to there well known form +\begin{remark} + Note the similarity to the LS estimates but also that they are \emph{not} identical. Its a well known fact that the LS and MLE estimates under the Multivariate Normal model are identical. This seems to be violated, but this is \emph{not} the case because the equivalency only holds for the unstructured case. Both the LS and MLE solutions simplify in the unstructured case ($\ten{X}_i, \ten{F}_i$ are of order $r = 1$, e.g. vector valued) to the same well known solution, compare Example~\ref{ex:ls_vector_case} and Example~\ref{ex:mle_vector_case}. 
+\end{remark} + +\begin{example}[Vector Valued MLE ($r = 1$)]\label{ex:mle_vector_case} + Like in Example~\ref{ex:ls_vector_case} let the observations be vector valued. We get $\ten{F} = \ten{F}_{(1)} = \t{\mat{F}}$ and $\ten{X} = \ten{X}_{(1)} = \t{\mat{X}}$ which are both matrices of dimensions $n\times p$ and $n\times q$, respectively. The estimated residuals are $\ten{R} = \ten{R}_{(1)} = \t{\mat{R}} = \t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}})}$ with $\mat{\alpha} = \mat{\alpha}_1$ as the single parameters matrix. In this case the tensor MLE estimate for $\mat{\alpha}$ simplifies to its well known form \begin{align*} - \widehat{\mat{\alpha}}_1 - &= \widehat{\mat{\Delta}}_1((\ten{X}-\widehat{\ten{\mu}})\times_{k\in[1]}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \\ - &= \widehat{\mat{\Delta}}_1\widehat{\mat{\Delta}}_1^{-1}((\ten{X}-\widehat{\ten{\mu}}))_{(1)}\t{\ten{F}_{(1)}}[\ten{F}_{(1)}\t{\ten{F}_{(1)}}]^{-1} \\ - &= \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1} + \widehat{\mat{\alpha}} + &= (\ten{X}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}[(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\Delta}}_k^{-1}\widehat{\mat{\alpha}}_k)_{(j)}\t{(\ten{F}\times_{k\in\emptyset}\widehat{\mat{\alpha}}_k)_{(j)}}]^{-1} \\ + &= \ten{X}_{(1)}\t{\ten{F}_{(1)}}(\ten{F}_{(1)}\t{\ten{F}_{(1)}})^{-1} + = \t{\mat{X}}\mat{F}(\t{\mat{F}}\mat{F})^{-1} \end{align*} - as well as the covariance estimate + and the estimate for the covariance $\mat{\Delta} = \mat{\Delta}_1$ is simply \begin{align*} - \widehat{\mat{\Delta}}_1 + \widehat{\mat{\Delta}} &= \frac{1}{n}(\ten{R}\times_{k\in\emptyset}{\widehat{\mat{\Delta}}}_k^{-1})_{(1)}\t{\ten{R}_{(1)}} = \frac{1}{n}\t{\mat{R}}\mat{R} = \frac{1}{n}\t{(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1})}(\mat{X} - \mat{F}\t{\widehat{\mat{\alpha}}_1}). \end{align*} - Note that $\ten{F}_{(1)} = \t{\mat{F}}$, $\ten{X}_{(1)} = \t{\mat{X}}$ and $\ten{R}_{(1)} = \t{\mat{R}}$. \end{example} -\paragraph{Comparison to the general case:} {\color{red} There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.} +\paragraph{Comparison to the general case:} \todo{There are two main differences, first the general case has a closed form solution for the gradient due to the explicit nature of the MLE estimate of $\widehat{\mat\Delta}$ compared to the mutually dependent MLE estimates $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$. On the other hand the general case has dramatically bigger dimensions of the covariance matrix ($p q \times p q$) compared to the two Kronecker components with dimensions $q \times q$ and $p \times p$. 
This means that in the general case there is a huge performance penalty in the dimensions of $\widehat{\mat\Delta}$ while in the other case an extra estimation is required to determine $\widehat{\mat\Delta}_1$, $\widehat{\mat\Delta}_2$.}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%% Alternative covariance estimates %%%
+\section{Algorithms}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\subsection{Alternative covariance estimates}
+A straightforward idea for parameter estimation is to use Gradient Descent. For a purely algorithmic speedup, changing only the update rule but \emph{not} the gradient computation of the objective function, we use Nesterov Accelerated Gradient Descent as described in Section~\ref{sec:alg_gradient_descent}. An alternative approach, applicable to all the methods, is to resolve the cross dependence in the estimator equation systems by assuming all other estimates to be fixed. This yields a closed form solution for the current estimate, which is then updated accordingly. Cycling through all the parameters and iterating this process till convergence gives an alternative method as described in Section~\ref{sec:alg_iterative_updating}. In both cases initial estimates are needed to start the iterative process, which is the subject of Section~\ref{sec:alg_init}.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Nesterov Accelerated Gradient Descent}\label{sec:alg_gradient_descent}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+In Section~\ref{sec:kron_cov} we derived the gradients for the different objective functions, that is, the parameterized functions serving as minimization targets. In Section~\ref{sec:ls} the objective function is the Frobenius norm of the estimated residuals for solving the Least Squares problem, in Section~\ref{sec:mle} it is the log-likelihood yielding the MLE estimates, and in Section~\ref{sec:approx} a pseudo log-likelihood. Regardless of which estimates we want to find, denote by $l$ the minimization objective of the corresponding minimization problem with parameters $\mat{\Theta}$ collecting all the parameters of the objective. The classic gradient descent algorithm starts with initial values $\mat{\Theta}^{(0)}$, see Section~\ref{sec:alg_init}, and applies gradient updates with a given learning rate $\delta > 0$ until convergence. The algorithm used here is an extension of classic Gradient Descent, namely Nesterov Accelerated Gradient Descent. It performs gradient updates similar to Gradient Descent, but before evaluating the gradient the current position is extrapolated into the previous step direction. Furthermore, an internal line search loop is used to determine an appropriate step size. See Algorithm~\ref{alg:gd} for the case of jointly estimating the parameter matrices $\widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r$ and the covariances $\widehat{\mat{\Delta}}_1, ..., \widehat{\mat{\Delta}}_r$. If the parameter matrices and the covariances are \emph{not} estimated jointly, as in the LS estimation, the parameter vector $\mat{\Theta}$ consists only of the parameter matrices $\mat{\alpha}_j$ (which is the only difference) and at the end of the algorithm the estimated parameter matrices can be used to estimate the covariances.
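To make the update rule concrete before stating the pseudocode, the following is a minimal R sketch of the accelerated gradient step with Armijo backtracking for a generic smooth minimization objective acting on a plain numeric parameter vector. The function name nagd, its arguments and the simple stopping rule are assumptions for this sketch only and do not reflect the interface of the accompanying package; in particular, the Armijo check here compares against the current iterate.

# Nesterov accelerated gradient descent with Armijo line search (sketch).
# fn: objective to minimize, gr: its gradient, theta0: start value.
nagd <- function(fn, gr, theta0, delta = 0.01, c1 = 1e-4, gamma = 0.5,
                 max_iter = 100L, tol = 1e-7) {
  theta_old <- theta_new <- theta0
  m_old <- 0; m_new <- 1                       # momentum extrapolation weights
  for (t in seq_len(max_iter)) {
    M <- theta_new + ((m_old - 1) / m_new) * (theta_new - theta_old)
    G <- gr(M)                                 # gradient at the extrapolated point
    cand <- M - delta * G
    for (d in delta * gamma^(-1:20)) {         # line search: grow once, then shrink
      cand <- M - d * G
      if (fn(cand) <= fn(theta_new) - c1 * d * sum(G^2)) {  # Armijo condition
        delta <- d                             # keep the accepted step size
        break
      }
    }
    theta_old <- theta_new
    theta_new <- cand
    if (sum((theta_new - theta_old)^2) < tol^2) break
    m_old <- m_new
    m_new <- (1 + sqrt(1 + (2 * m_new)^2)) / 2 # update extrapolation weights
  }
  theta_new
}

# Example: minimizing a simple quadratic converges to (1, 2, 3).
fn <- function(x) sum((x - 1:3)^2)
gr <- function(x) 2 * (x - 1:3)
round(nagd(fn, gr, numeric(3)), 4)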
+
+\todo{more details, better explanation, higher/lower abstraction with respect to the different methods?!}
+
+\begin{algorithm}[ht]
+    \caption{\label{alg:gd}Nesterov Accelerated Gradient Descent}
+    \begin{algorithmic}[1]
+        \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$
+        \State Initialize: $\mat{\Theta}^{(0)} = (\mat{\alpha}^{(0)}_1, ..., \mat{\alpha}^{(0)}_r, \mat{\Delta}_1^{(0)}, ..., \mat{\Delta}_r^{(0)})$, $0 < c, \delta^{(1)}$ and $0 < \gamma < 1$ \Comment{See Section~\ref{sec:alg_init}}
+        \\
+        \State $t \leftarrow 1$
+            \Comment{step counter}
+        \State $\mat{\Theta}^{(1)} \leftarrow \mat{\Theta}^{(0)}$
+            \Comment{artificial first step}
+        \State $(m^{(0)}, m^{(1)}) \leftarrow (0, 1)$
+            \Comment{momentum extrapolation weights}
+        \\
+        \Repeat \Comment{repeat until convergence}
+            \State $\mat{M} \leftarrow \mat{\Theta}^{(t)} + \frac{m^{(t - 1)} - 1}{m^{(t)}}(\mat{\Theta}^{(t)} - \mat{\Theta}^{(t - 1)})$ \Comment{momentum extrapolation}
+            \For{$\delta = \gamma^{-1}\delta^{(t)}, \delta^{(t)}, \gamma\delta^{(t)}, \gamma^2\delta^{(t)}, ...$} \Comment{Line Search}
+                \State $\mat{\Theta}_{\text{temp}} \leftarrow \mat{M} - \delta \nabla_{\mat{\Theta}} l(\mat{M})$
+                \If{$l(\mat{\Theta}_{\text{temp}}) \leq l(\mat{\Theta}^{(t - 1)}) - c \delta \|\nabla_{\mat{\Theta}} l(\mat{M})\|_F^2$} \Comment{Armijo Condition}
+                    \State $\mat{\Theta}^{(t + 1)} \leftarrow \mat{\Theta}_{\text{temp}}$
+                    \State $\delta^{(t + 1)} \leftarrow \delta$
+                    \Break
+                \EndIf
+            \EndFor
+            \State $m^{(t + 1)} \leftarrow \frac{1 + \sqrt{1 + (2 m^{(t)})^2}}{2}$ \Comment{update extrapolation weights}
+            \State $t \leftarrow t + 1$
+        \Until converged
+    \end{algorithmic}
+\end{algorithm}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Iterative Cyclic Updating}\label{sec:alg_iterative_updating}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+For both the LS and MLE estimates we have derived equation systems, namely \eqref{eq:ls_est_alphas}, \eqref{eq:mle_est_alphas} and \eqref{eq:mle_est_Deltas}, for the estimates which are cross dependent. The idea of Iterative Cyclic Updating is simply to take the cross dependent equations and assume the unknown quantities to be given by the previous iteration. Then the cross dependency reduces to a closed form solution depending on the known previous iterates. This iterative process is repeated till convergence, see Algorithm~\ref{alg:iterative_updating} for the case of LS.
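The cyclic LS updates of Algorithm~\ref{alg:iterative_updating} below translate almost directly into R. The following sketch reuses the unfold and mode_prod helpers from the model sketch in Section~\ref{sec:kron_cov} and initializes the $\mat{\alpha}_j$ naively with truncated identity matrices; in practice the initial values of Section~\ref{sec:alg_init} would be used, and all names are illustrative rather than the package interface.

# Iterative cyclic updating for the LS estimates (illustrative sketch).
# X: p1 x ... x pr x n sample tensor, Fy: q1 x ... x qr x n predictor tensor.
ls_cyclic <- function(X, Fy, max_iter = 50L, tol = 1e-7) {
  r <- length(dim(X)) - 1L                      # number of modes (last mode = samples)
  p <- dim(X)[seq_len(r)]
  q <- dim(Fy)[seq_len(r)]
  alphas <- Map(function(pj, qj) diag(1, pj, qj), p, q)   # naive initial values
  for (iter in seq_len(max_iter)) {
    prev <- alphas
    for (j in seq_len(r)) {
      # F multiplied by alpha_k along every mode except j
      Fj <- Fy
      for (k in seq_len(r)[-j]) Fj <- mode_prod(Fj, alphas[[k]], k)
      Fj_j <- unfold(Fj, j)
      # closed form update for alpha_j given all other alpha_k (eq:ls_est_alphas)
      alphas[[j]] <- (unfold(X, j) %*% t(Fj_j)) %*% solve(Fj_j %*% t(Fj_j))
    }
    # convergence: total squared Frobenius change over all modes
    if (sum(mapply(function(a, b) sum((a - b)^2), alphas, prev)) < tol^2) break
  }
  alphas
}

Applied to the X and Fy generated in the earlier sketch, the recovered $\mat{\alpha}_j$ can only agree with the true ones up to the scale indeterminacy between modes (rescaling $\mat{\alpha}_1$ by $c$ and $\mat{\alpha}_2$ by $1/c$ leaves the model unchanged), which is one reason the simulation metrics later on compare projections onto the spanned subspaces rather than raw coefficients.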
+
+\begin{algorithm}[ht]
+    \caption{\label{alg:iterative_updating}Iterative Cyclic Updating for LS estimates}
+    \begin{algorithmic}[1]
+        \State Arguments: Order $r + 1$ tensors $\ten{X}$, $\ten{F}$
+        \State Initialize: $\mat{\alpha}^{(0)}_1, ..., \mat{\alpha}^{(0)}_r$ \Comment{See Section~\ref{sec:alg_init}}
+        \\
+        \State $t \leftarrow 0$
+        \Repeat\Comment{until convergence}
+            \For{$j = 1$ to $r$}\Comment{For each mode}
+                \State $\widehat{\mat{\alpha}}^{(t+1)}_j \leftarrow \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}}[(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}^{(t)}_k)_{(j)}}]^{-1}$\label{alg:iterative_updating:update} \\
+                    \Comment{Solve for $\widehat{\mat{\alpha}}_j$ given all others}
+            \EndFor
+        \Until{$\sum_{j = 1, ..., r} \| \widehat{\mat{\alpha}}^{(t)}_j - \widehat{\mat{\alpha}}^{(t - 1)}_j \|_F^2 < \epsilon^2$}
+            \Comment{convergence condition (one example)}
+    \end{algorithmic}
+\end{algorithm}
+
+A refined version would be to always use the newest estimates. In the case of Algorithm~\ref{alg:iterative_updating} this means that when computing $\widehat{\mat{\alpha}}_j^{(t + 1)}$ we use $\widehat{\mat{\alpha}}_k^{(t + 1)}$ for $k = 1, ..., j - 1$ and $\widehat{\mat{\alpha}}_k^{(t)}$ for $k = j + 1, ..., r$ in line \ref{alg:iterative_updating:update} instead.
+
+Furthermore, there is also the idea of randomizing the updating order, which seems to improve convergence and stabilizes the algorithm.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Initial Values}\label{sec:alg_init}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+Currently there are two approaches for the initial value estimates required by Algorithm~\ref{alg:gd} and Algorithm~\ref{alg:iterative_updating}. First there is the \emph{Van Loan and Pitsianis} (VLP) method briefly described in Section~\ref{sec:VLP}. The second ansatz is to perform a \emph{Higher Order Principal Component Analysis} (HOPCA) and take the required number of eigenvectors as initial values, as described in Section~\ref{sec:HOPCA}.
+
+\subsubsection{Van Loan and Pitsianis (VLP)}\label{sec:VLP}
+The VLP approach builds on a least squares solution combined with a rank 1 Kronecker product decomposition \cite{ApproxKron-VanLoanPitsianis1993}. To use the VLP decomposition we solve the vectorized version of model \eqref{eq:sample_model} (assuming $\ten{\mu} = 0$)
+\begin{displaymath}
+    \ten{X}_{(r+1)} = \ten{F}_{(r+1)} \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j + \ten{\epsilon}_{(r+1)}.
+\end{displaymath}
+Let $\mat{B} = \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j$, then an estimate of $\mat{B}$ without considering the Kronecker structure is given by the usual least squares solution
+\begin{displaymath}
+    \widehat{\mat{B}} = (\t{\ten{F}_{(r+1)}}\ten{F}_{(r+1)})^{-1}\t{\ten{F}_{(r+1)}}\ten{X}_{(r+1)}.
+\end{displaymath}
+Then $\widehat{\mat{B}}$ is approximated using the VLP rank 1 Kronecker decomposition (iteratively if $r > 2$, i.e. when there are more than two $\widehat{\mat{\alpha}}_j$'s) as
+\begin{displaymath}
+    \widehat{\mat{\alpha}}_1, ..., \widehat{\mat{\alpha}}_r = \argmin_{\mat{\alpha}_1, ..., \mat{\alpha}_r} \| \widehat{\mat{B}} - \bigotimes_{j = r}^1 \t{\mat{\alpha}}_j \|_F.
+\end{displaymath}
+
+\subsubsection{Higher Order Principal Component Analysis (HOPCA)}\label{sec:HOPCA}
+The \emph{Higher Order Principal Component Analysis} is a simple method for estimating Principal Components for each mode of a dataset consisting of tensor valued observations, as illustrated in Algorithm~\ref{alg:HOPCA}.
+
+\begin{algorithm}[ht]
+    \caption{\label{alg:HOPCA}Higher Order Principal Component Analysis}
+    \begin{algorithmic}[1]
+        \State Arguments: Order $r + 1$ tensor $\ten{X}$ of dimensions $p = (p_1, ..., p_r, n)$, $q = (q_1, ..., q_r)$ the number of components for each mode
+        \State Returns: $\mat{A}_j$ PC matrices for each mode of dimensions $p_j\times q_j$, $j = 1, ..., r$.
+        \\
+        \For{$j = 1$ to $r$}\Comment{For each mode}
+            \State $\mat{A}_j \leftarrow$ the first $q_j$ eigenvectors of $\ten{X}_{(j)}\t{\ten{X}_{(j)}}$
+        \EndFor
+    \end{algorithmic}
+\end{algorithm}
+
+
+\subsection{Estimation Algorithms}
+By using the initial values from Section~\ref{sec:alg_init} for the Gradient Descent algorithm from Section~\ref{sec:alg_gradient_descent} or the Iterative Updating algorithm from Section~\ref{sec:alg_iterative_updating}, the complete estimation sequence is assembled as illustrated in Figure~\ref{fig:algo_dependency}.
+
+\begin{figure}[!hp]
+    \centering
+    \begin{tikzpicture}[>=latex]
+        \node[draw, text centered, text width = 3cm, gray] (vlp) at (-2, 0) {VLP \\ Section~\ref{sec:alg_init}};
+        \node[draw, text centered, text width = 3cm] (hopca) at (2, 0) {HOPCA \\ Section~\ref{sec:alg_init}};
+        \node[draw, text centered, text width = 3cm] (ls) at (0, -1.5) {LS \\ Section~\ref{sec:ls},~\ref{sec:alg_gradient_descent},~\ref{sec:alg_iterative_updating}};
+        \node[draw, text centered, text width = 3cm] (mle) at (0, -3) {MLE \\ Section~\ref{sec:mle},~\ref{sec:alg_gradient_descent},~\ref{sec:alg_iterative_updating}};
+        % \node[draw, text centered, text width = 3cm] (approx) at (2, -3) {pseudo MLE \\ Section~\ref{sec:mle},~\ref{sec:alg_gradient_descent},\todo{}};
+        \draw[->, gray] (vlp) -- (ls);
+        \draw[->] (hopca) -- (ls);
+        \draw[->] (ls) -- (mle);
+        \draw[->, dotted] (vlp) |- (mle);
+        \draw[->, dotted] (hopca) |- (mle);
+        % \draw[->] (ls) -- (approx);
+        % \draw[->, dotted] (vlp) -- (mle);
+        % \draw[->, dotted] (hopca) -- (approx);
+        % \draw[dotted] ($(vlp.south) + (0, -0.25)$) -- ($(hopca.south) + (0, -0.25)$);
+    \end{tikzpicture}
+    \caption{\label{fig:algo_dependency}Algorithmic dependencies.}
+\end{figure}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Simulations}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Metrics}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+For measuring the performance of the different methods in the simulations we employ different metrics to compare the estimates against the ``true'' parameters used to generate the simulation data. The parameters in the sample model \eqref{eq:sample_model} are the parameter matrices $\mat{\alpha}_j$ and the covariance matrices $\mat{\Delta}_j$ with estimates $\widehat{\mat{\alpha}}_j$ and $\widehat{\mat{\Delta}}_j$. Given the ``true'' parameters $\mat{\alpha}_j, \mat{\Delta}_j$, $j = 1, ..., r$ used to generate the simulation data samples $(\ten{X}_i, \ten{F}_i)$ for $i = 1, ..., n$ we employ a few metrics.
+
+First, there is the \emph{maximum subspace distance}, the maximum over the $j = 1, ..., r$ modes of the Frobenius norms of the differences of the subspace projection matrices,
+\begin{displaymath}
+    d_{\mat{\alpha}}((\mat{\alpha}_1, ..., \mat{\alpha}_r), (\widehat{\mat{\alpha}}_1, ... ,\widehat{\mat{\alpha}}_r)) = \max_{j = 1, ..., r} \| \mat{P}_{\mat{\alpha}_j} - \mat{P}_{\widehat{\mat{\alpha}}_j} \|_F
+\end{displaymath}
+where $\mat{P}_{\mat{A}} = \mat{A}(\t{\mat{A}}\mat{A})^{-1}\t{\mat{A}}$ is the projection onto $\Span(\mat{A})$.
+
+Another interesting distance is between the Kronecker products of the parameter matrices $\mat{\beta} = \bigotimes_{j = r}^1 \mat{\alpha}_j$ and its estimate $\widehat{\mat{\beta}} = \bigotimes_{j = r}^1 \widehat{\mat{\alpha}}_j$. They correspond to the parameters of the vectorized model under the Kronecker product constraint. We again use the Frobenius norm of the projection differences
+\begin{displaymath}
+    d_{\mat{\beta}} = \|\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}}\|_F.
+\end{displaymath}
+This can be very expensive to compute directly. Fortunately, the computation can be simplified drastically in terms of the size of the involved matrices. Using properties of projections as well as of the Kronecker product we can rewrite
+\begin{align*}
+    \|\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}}\|_F^2
+    &= \tr((\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}})\t{(\mat{P}_{\mat{\beta}} - \mat{P}_{\widehat{\mat{\beta}}})}) \\
+    &= \tr(\mat{P}_{\mat{\beta}}) - 2\tr(\mat{P}_{\mat{\beta}}\mat{P}_{\widehat{\mat{\beta}}}) + \tr(\mat{P}_{\widehat{\mat{\beta}}}) \\
+    &= \prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}) - 2\prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}\mat{P}_{\widehat{\mat{\alpha}}_j}) + \prod_{j = 1}^r\tr(\mat{P}_{\widehat{\mat{\alpha}}_j}) \\
+    &= \prod_{j = 1}^r\rank(\mat{\alpha}_j) - 2\prod_{j = 1}^r\tr(\mat{P}_{\mat{\alpha}_j}\mat{P}_{\widehat{\mat{\alpha}}_j}) + \prod_{j = 1}^r\rank(\widehat{\mat{\alpha}}_j).
+\end{align*}
+This formulation allows for a substantially more efficient implementation, which can lead to a drastic speedup in the simulations, because computing $d_{\mat{\beta}}$ in its raw form can take a significant amount of time compared to the estimation itself. In addition, the memory footprint is reduced drastically.
+
+Finally, to validate the estimation quality of the covariances $\mat{\Delta}_j$, we compute the Frobenius norm of the difference between the ``true'' covariance $\mat{\Delta} = \bigotimes_{j = r}^1 \mat{\Delta}_j$ and its estimate $\widehat{\mat{\Delta}} = \bigotimes_{j = r}^1 \widehat{\mat{\Delta}}_j$,
+\begin{displaymath}
+    d_{\mat{\Delta}}(\mat{\Delta}, \widehat{\mat{\Delta}}) = \|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F.
+\end{displaymath}
+Again, this can be rewritten in a computationally easier form
+\begin{align*}
+    \|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F^2
+    &= \tr((\mat{\Delta} - \widehat{\mat{\Delta}})\t{(\mat{\Delta} - \widehat{\mat{\Delta}})}) \\
+    &= \tr\mat{\Delta}^2 - 2 \tr\mat{\Delta}\widehat{\mat{\Delta}} + \tr\widehat{\mat{\Delta}}^2 \\
+    &= \prod_{j = 1}^r\tr\mat{\Delta}_j^2 - 2 \prod_{j = 1}^r\tr\mat{\Delta}_j\widehat{\mat{\Delta}}_j + \prod_{j = 1}^r\tr\widehat{\mat{\Delta}}_j^2.
+\end{align*}
+There are also cases where the ``true'' covariance $\mat{\Delta}$ is \emph{not} a Kronecker product; in this situation the distance needs to be computed directly as $\|\mat{\Delta} - \widehat{\mat{\Delta}}\|_F$.
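+
+A practical remark on evaluating the trace terms above (an implementation note, not part of the model): they only involve matrices of the small mode dimensions. If $\mat{Q}_j$ and $\widehat{\mat{Q}}_j$ denote orthonormal bases of $\Span(\mat{\alpha}_j)$ and $\Span(\widehat{\mat{\alpha}}_j)$, for example obtained from QR decompositions, then
+\begin{displaymath}
+    \tr(\mat{P}_{\mat{\alpha}_j}\mat{P}_{\widehat{\mat{\alpha}}_j})
+    = \tr(\mat{Q}_j\t{\mat{Q}_j}\widehat{\mat{Q}}_j\t{\widehat{\mat{Q}}_j})
+    = \|\t{\mat{Q}_j}\widehat{\mat{Q}}_j\|_F^2,
+\end{displaymath}
+so the $p_j\times p_j$ projection matrices never need to be formed explicitly.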
+
+Furthermore, we report the negative log-likelihood without the constraint term as \emph{loss} and the \emph{mean squared error} MSE
+\begin{displaymath}
+    \text{MSE} = \frac{1}{n p} \|\ten{R}\|_F^2,
+\end{displaymath}
+where $\ten{R}$ denotes the tensor of residuals and $p = \prod_{j = 1}^{r} p_j$ is the number of predictor entries per observation.
+
+% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% \subsection{Simulations}
+% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+% In this section we present a few simulation studies.
+
+% % \subsubsection{Sim 1}
+% % For $n = 200$ observations, with
+
+% % \begin{figure}
+% %     \centering
+% %     \includegraphics[width = 0.6\textwidth]{sim2.logs.png}
+% % \end{figure}
+
+% % The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as
+% % \begin{displaymath}
+% %     \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta})
+% % \end{displaymath}
+% % where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$.
+
+% \subsubsection{Sim 4 (multi-variable polynomial interactions)}
+% Lets considure a polynomial model with $r$ variables. Each variable by itself is described by a polynomial of order $p_j - 1$. Let $z_j$ be the variables for the $j$'th axis and let $\mat{\alpha}_j\in\mathbb{R}^{p_j}$, $j = 1, ..., r$ be the parameters for the polynomial of the $j$'th variable (note that the parameters $\mat{\alpha}_j$ are vectors treates by the estimation methods as $p_j\times 1$ matrices).
+% \begin{displaymath}
+%     y = \prod_{j = 1}^r \sum_{k = 1}^{p_j}\mat{\alpha}_{j, k} z_j^{k - 1}
+% \end{displaymath}
+% For the simulation we take a 3D problem ($r = 3$) and quadratic polynomials. For $n = 500$ samples the parameters $\mat{z} = (z_1, z_2, z_3)$ are samples uniformly from the $[-1, 1]^3$ cube as arguments to quadratic polynomials $p_1 = p_2 = p_3 = 3$. The $i$'th scalar response $y_i$ is then computed by above formula which can be written in a multi-linear setting as
+% \begin{displaymath}
+%     y_i = \ten{X}_i\times_{j\in[r]}\t{\mat{\alpha}_j} + \mathcal{N}(0, \sigma^2)
+% \end{displaymath}
+% where the order $r = 3$ tensor $\ten{X}$ consists of all interaction terms of the monoms $x_{j}^{k - 1}$ for $k = 1, ..., p_j$ for each axis $j = 1, ..., r$ ans $\sigma = 0.1$. More explicitly, the vectorized version satisfies
+% \begin{displaymath}
+%     \vec{\ten{X}} = \bigotimes_{j = r}^r \t{(1, z_j, z_j^2, ..., z_j^{p_j-1})}.
+% \end{displaymath}
+
+% This corresponds to the classic polynomial interaction model of the form
+% \begin{displaymath}
+%     y = \t{\vec(\ten{Z})}\mat{\beta} + \mathcal{N}(0, \sigma^2).
+% \end{displaymath}
+% where $\mat{\beta} = \bigotimes_{j = r}^1\mat{\alpha}_j$.
+
+\subsection{EEG Data}
+As a real world example we compare the HO-PIR method against two reference methods, namely LSIR \cite{lsir-PfeifferForzaniBura} and K-PIR \cite{sdr-PfeifferKaplaBura2021}, on the EEG data set. This is a small study including $77$ alcoholic individuals and $45$ control subjects (see: \url{http://kdd.ics.uci.edu/databases/eeg/eeg.data.html}).
The data for each subject consisted of a $64\times 256$ matrix, with each column representing a time point and each row a channel. The data were obtained by exposing each individual to visual stimuli and measuring voltage values from $64$ electrodes placed on the subjects' scalps, sampled at $256$ time points (at $256$ Hz for $1$ second). Each individual observation is the mean over $120$ different trials per subject. We first preprocess the data using the HOPCA Algorithm~\ref{alg:HOPCA} with different numbers of principal components per mode. The results are listed in Table~\ref{tab:eeg_sim}.
+
+
+\begin{table}[!ht]
+    \centering
+    \begin{tabular}{*{2}{l} | *{2}{c}}
+        method & npc & Acc & AUC (sd) \\
+        \hline
+        K-PIR (mle) & (3, 4) & 0.70 & 0.75 (0.05) \\
+        LSIR & (3, 4) & 0.80 & 0.85 (0.04) \\
+        HO-PIR (ls) & (3, 4) & 0.80 & 0.85 (0.04) \\ % Algorithm: ICU
+        % HO-PIR (ls,nagd) & (3, 4) & 0.80 & 0.85 (0.04) \\
+        HO-PIR (mle) & (3, 4) & 0.80 & 0.85 (0.04) \\ % Algorithm: ICU
+        % HO-PIR (mle,nagd)& (3, 4) & 0.80 & 0.85 (0.04) \\
+        \hline
+        K-PIR (mle) & (15, 15) & --- & 0.78 (0.04) \\
+        LSIR & (15, 15) & 0.72 & 0.81 (0.04) \\
+        HO-PIR (ls) & (15, 15) & 0.79 & 0.83 (0.04) \\ % Algorithm: ICU
+        % HO-PIR (ls,nagd) & (15, 15) & 0.79 & 0.83 (0.04) \\
+        HO-PIR (mle) & (15, 15) & 0.77 & 0.83 (0.04) \\ % Algorithm: ICU
+        % HO-PIR (mle,nagd)& (15, 15) & 0.76 & 0.81 (0.04) \\
+        \hline
+        K-PIR (mle) & (20, 30) & --- & 0.78 (0.04) \\
+        LSIR & (20, 30) & 0.79 & 0.83 (0.04) \\
+        HO-PIR (ls) & (20, 30) & 0.75 & 0.80 (0.05) \\ % Algorithm: ICU
+        % HO-PIR (ls,nagd) & (20, 30) & 0.75 & 0.80 (0.05) \\
+        HO-PIR (mle) & (20, 30) & 0.75 & 0.83 (0.04) % \\ % Algorithm: ICU
+        % HO-PIR (mle,nagd)& (20, 30) & 0.75 & 0.80 (0.05)% \\
+        % %HOPIR(ls,icu) & (256, 64) & 0.67 & 0.69 (0.05) \\
+    \end{tabular}
+    \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7.
The methods HO-PIR (ls) and HO-PIR (mle) use the ICU Algorithm~\ref{alg:iterative_updating}.} +\end{table} + +% % Same as above, but with more content +% \begin{table}[!ht] +% \centering +% \begin{tabular}{*{2}{l} | *{7}{c}} +% method & npc & Acc & Err & FPR & TPR & FNR & TNR & AUC (sd) \\ +% \hline +% K-PIR (mle) & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.05) \\ +% LSIR & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.04) \\ +% HO-PIR (ls,icu) & (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ +% HO-PIR (ls,nagd) & (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ +% HO-PIR (mle,icu) & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.04) \\ +% HO-PIR (mle,nagd)& (3, 4) & 0.80 & 0.20 & 0.33 & 0.87 & 0.13 & 0.67 & 0.85 (0.04) \\ +% \hline +% K-PIR (mle) & (15, 15) & --- & --- & --- & --- & --- & --- & 0.78 (0.04) \\ +% LSIR & (15, 15) & 0.72 & 0.28 & 0.44 & 0.82 & 0.18 & 0.56 & 0.81 (0.04) \\ +% HO-PIR (ls,icu) & (15, 15) & 0.79 & 0.21 & 0.38 & 0.88 & 0.12 & 0.62 & 0.83 (0.04) \\ +% HO-PIR (ls,nagd) & (15, 15) & 0.79 & 0.21 & 0.38 & 0.88 & 0.12 & 0.62 & 0.83 (0.04) \\ +% HO-PIR (mle,icu) & (15, 15) & 0.77 & 0.23 & 0.40 & 0.87 & 0.13 & 0.60 & 0.83 (0.04) \\ +% HO-PIR (mle,nagd)& (15, 15) & 0.76 & 0.24 & 0.47 & 0.90 & 0.10 & 0.53 & 0.81 (0.04) \\ +% \hline +% K-PIR (mle) & (20, 30) & --- & --- & --- & --- & --- & --- & 0.78 (0.04) \\ +% LSIR & (20, 30) & 0.79 & 0.21 & 0.36 & 0.87 & 0.13 & 0.64 & 0.83 (0.04) \\ +% HO-PIR (ls,icu) & (20, 30) & 0.75 & 0.25 & 0.38 & 0.83 & 0.17 & 0.62 & 0.80 (0.05) \\ +% HO-PIR (ls,nagd) & (20, 30) & 0.75 & 0.25 & 0.38 & 0.83 & 0.17 & 0.62 & 0.80 (0.05) \\ +% HO-PIR (mle,icu) & (20, 30) & 0.75 & 0.25 & 0.40 & 0.83 & 0.17 & 0.60 & 0.83 (0.04) \\ +% HO-PIR (mle,nagd)& (20, 30) & 0.75 & 0.25 & 0.42 & 0.86 & 0.14 & 0.58 & 0.80 (0.05)% \\ +% %HOPIR(ls,icu) & (256, 64) & 0.67 & 0.33 & 0.53 & 0.79 & 0.21 & 0.47 & 0.69 (0.05) \\ +% \end{tabular} +% \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7.} +% \end{table} + + +% \begin{table}[!ht] +% \centering +% % see: https://en.wikibooks.org/wiki/LaTeX/Tables +% \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} +% method & init +% & \multicolumn{2}{c}{loss} +% & \multicolumn{2}{c}{MSE} +% & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} +% & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} +% \\ \hline +% K-PIR (mle) & & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ % base +% new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ +% new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ +% momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ +% momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ +% approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ +% approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\ +% \end{tabular} +% \caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensions $(p_1, p_2) = (11, 7)$, $(q_1, q_2) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. 
The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.} +% \end{table} + +% \begin{figure} +% \centering +% \includegraphics{loss_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{estimates_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{Delta_Ex01.png} +% \end{figure} +% \begin{figure} +% \centering +% \includegraphics{hist_Ex01.png} +% \end{figure} + +% > print(times, digits = 2) +% method npc elapsed sys.self user.self +% 1 hopir.ls.icu (3, 4) 0.003 0.006 0.005 +% 3 hopir.ls.nagd (3, 4) 0.023 0.000 0.023 +% 2 hopir.mle.icu (3, 4) 0.020 0.044 0.038 +% 4 hopir.mle.nagd (3, 4) 0.062 0.125 0.109 + +% 5 hopir.ls.icu (15, 15) 0.012 0.061 0.037 +% 7 hopir.ls.nagd (15, 15) 0.089 0.009 0.082 +% 6 hopir.mle.icu (15, 15) 0.176 0.860 0.503 +% 8 hopir.mle.nagd (15, 15) 0.212 0.657 0.478 +% 9 hopir.ls.icu (20, 30) 0.028 0.142 0.079 +% 11 hopir.ls.nagd (20, 30) 0.302 0.269 0.347 +% 10 hopir.mle.icu (20, 30) 0.429 2.043 1.350 +% 12 hopir.mle.nagd (20, 30) 0.461 0.958 0.828 + +% 13 hopir.ls.icu (256, 64) 0.946 3.574 2.840 + +% \begin{table}[!ht] +% \centering +% \begin{tabular}{*{3}{l} | *{7}{c}} +% method & init & npc & Acc & Err & FPR & TPR & FNR & TNR & AUC (sd) \\ \hline +% K-PIR (mle) & & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ % base +% new & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ +% new & ls & (3, 4) & 0.74 & 0.26 & 0.51 & 0.88 & 0.12 & 0.49 & 0.77 (0.045) \\ +% ls & & (3, 4) & 0.75 & 0.25 & 0.49 & 0.88 & 0.12 & 0.51 & 0.78 (0.044) \\ +% ls$^*$ & & (3, 4) & 0.78 & 0.22 & 0.38 & 0.87 & 0.13 & 0.62 & 0.86 (0.034) \\ % +% LSIR & & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.036) \\ +% momentum & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ +% momentum & ls & (3, 4) & 0.70 & 0.30 & 0.58 & 0.87 & 0.13 & 0.42 & 0.76 (0.046) \\ +% approx & vlp & (3, 4) & 0.68 & 0.32 & 0.62 & 0.86 & 0.14 & 0.38 & 0.74 (0.048) \\ +% approx & ls & (3, 4) & 0.73 & 0.27 & 0.53 & 0.88 & 0.12 & 0.47 & 0.78 (0.044) \\ \hline +% ls & & (15, 15) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ +% ls$^*$ & & (15, 15) & 0.76 & 0.24 & 0.44 & 0.88 & 0.12 & 0.56 & 0.83 (0.039) \\ % +% LSIR & & (15, 15) & 0.72 & 0.28 & 0.44 & 0.82 & 0.18 & 0.56 & 0.81 (0.040) \\ +% approx & ls & (15, 15) & 0.73 & 0.27 & 0.51 & 0.87 & 0.13 & 0.49 & 0.78 (0.044) \\ \hline +% ls & & (20, 30) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ +% ls$^*$ & & (20, 30) & 0.77 & 0.23 & 0.36 & 0.84 & 0.16 & 0.64 & 0.79 (0.045) \\ % +% LSIR & & (20, 30) & 0.79 & 0.21 & 0.36 & 0.87 & 0.13 & 0.64 & 0.83 (0.038) \\ +% approx & ls & (20, 30) & 0.63 & 0.37 & 1.00 & 1.00 & 0.00 & 0.00 & 0.51 (0.053) \\ \hline +% HOPCA & & & 0.62 & 0.38 & 1 & 0.99 & 0.01 & 0 & 0.56 (0.053) \\ +% ls & & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ +% ls$^*$ & & & 0.68 & 0.32 & 0.51 & 0.79 & 0.21 & 0.49 & 0.66 (0.054) \\ % +% approx & ls & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ +% \end{tabular} +% \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7, EEG Data and Table~6 with new methods. Column \emph{vlp} stands for the Van Loan and Pitsianis initialization scheme as described in \cite{sdr-PfeifferKaplaBura2021} and \emph{ls} is the approach described above. The column \emph{npc} gives the number of principal component of the $(2D)^2 PCA$ preprocessing. 
Reduction by $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\alpha}}_j}$ instread of $^*$ where reduction is $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\Delta}}_j^{-1}\widehat{\mat{\alpha}}_j}$.} +% \end{table} + +% \begin{table} +% \centering +% \begin{tabular}{*{3}{l} | *{3}{r@{.}l}} +% method & init & npc +% & \multicolumn{2}{c}{elapsed} +% & \multicolumn{2}{c}{sys.self} +% & \multicolumn{2}{c}{user.self} \\ \hline +% base & & (3, 4) & 0&079 & 0&402 & 0&220 \\ +% new & vlp & (3, 4) & 0&075 & 0&393 & 0&217 \\ +% new & ls & (3, 4) & 0&218 & 0&243 & 0&305 \\ +% ls & & (3, 4) & 0&003 & 0&006 & 0&006 \\ +% LSIR & & (3, 4) & 0&002 & 0&000 & 0&002 \\ +% momentum & vlp & (3, 4) & 0&143 & 0&595 & 0&359 \\ +% momentum & ls & (3, 4) & 0&297 & 0&252 & 0&385 \\ +% approx & vlp & (3, 4) & 0&044 & 0&240 & 0&152 \\ +% approx & ls & (3, 4) & 0&066 & 0&144 & 0&121 \\ \hline +% ls & & (15, 15) & 0&012 & 0&059 & 0&034 \\ +% LSIR & & (15, 15) & 0&010 & 0&050 & 0&031 \\ +% approx & ls & (15, 15) & 0&813 & 3&911 & 2&325 \\ \hline +% ls & & (20, 30) & 0&028 & 0&129 & 0&080 \\ +% LSIR & & (20, 30) & 0&030 & 0&142 & 0&100 \\ +% approx & ls & (20, 30) & 2&110 & 10&111 & 6&290 \\ \hline +% HOPCA & & & 0&24 & 0&37 & 0&43 \\ +% ls & & & 1&252 & 6&215 & 3&681 \\ +% approx & ls & & 36&754 & 141&028 & 147&490 \\ +% \end{tabular} +% \caption{\label{tab:eeg_time}Like Table~\ref{tab:eeg_sim} but reports the mean run-time.} +% \end{table} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Bib and Index %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% \printindex +\nocite{*} +\printbibliography[heading=bibintoc,title={References}] + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% Appendix %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newpage + +\appendix +\section{Matrix Differential Rules} +Let $\mat A$ be a square matrix (and invertible if needed) and $|.|$ stands for the determinant +\begin{align*} + \d\log\mat A &= \frac{1}{|\mat A|}\d\mat{A} \\ + \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\ + \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\ + \d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1} +\end{align*} + +\section{Useful Matrix Identities} +In this section we summarize a few useful matrix identities, for more details see for example \cite{MatrixAlgebra-AbadirMagnus2005}. + +For two matrices $\mat A$ of dimensions $a_1\times a_2$ and $\mat B$ of dimensions $b_1\times b_2$ holds +\begin{equation} + \mat{K}_{b_1, a_1}(\mat{A}\otimes\mat{B})\mat{K}_{a_2, b_2} = \mat{B}\otimes\mat{A} +\end{equation} +as well as +\begin{equation}\label{eq:vecKron} + \vec(\mat A\kron\mat B) = (\mat{I}_{a_2}\kron\mat{K}_{b_2,a_1}\kron\mat{I}_{b_1})(\vec\mat A\kron\vec\mat B). +\end{equation} + +Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$ vectors such that $\t{\mat b}A^{-1}\mat a\neq -1$, then +\begin{displaymath} + (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}A^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1} +\end{displaymath} +as well as +\begin{displaymath} + \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a) +\end{displaymath} +which even holds in the case $\t{\mat b}\mat{A}^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem. 
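+
+For instance, specializing the determinant identity above to $\mat A = \mat{I}_p$ yields the rank one special case
+\begin{displaymath}
+    \det(\mat{I}_p + \mat a\t{\mat b}) = 1 + \t{\mat b}\mat a.
+\end{displaymath}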
+
+
+\section{Commutation Matrix and Permutation Identities}
+\begin{center}
+    Note: In this section we use 0-indexing for the sake of simplicity!
+\end{center}
+In this section we summarize relations between the commutation matrix and the corresponding permutation. We also list some extensions to ``simplify'' or represent certain terms. This is mostly intended for implementation purposes and for understanding terms occurring in the computations.
+
+Let $\mat A$ be an arbitrary $p\times q$ matrix. The commutation matrix $\mat K_{p, q}$ satisfies
+\begin{displaymath}
+    \mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1
+\end{displaymath}
+where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that
+\begin{displaymath}
+    \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1.
+\end{displaymath}
+
+\begin{table}[!htp]
+    \centering
+    \begin{minipage}{0.8\textwidth}
+        \centering
+        \begin{tabular}{l c l}
+            $\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\
+            $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\
+            $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$
+        \end{tabular}
+        \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.}
+    \end{minipage}
+\end{table}
+
+
+
+\section{Matrix and Tensor Operations}
+
+The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product}
+$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\ten{T}$ written as $\ten{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples}
+
+\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, its a mess.}
+\paragraph{Kronecker Product $\kron$:} For an $a_1\times a_2$ matrix $\mat{A}$ and a $b_1\times b_2$ matrix $\mat{B}$, the Kronecker product $\mat{A}\kron\mat{B}$ is the $a_1 b_1\times a_2 b_2$ block matrix whose $(i, j)$'th block is $\mat{A}_{i, j}\mat{B}$.
+\paragraph{Khatri-Rao Product $\hada$:} For matrices $\mat{A}$, $\mat{B}$ with the same number of columns, the Khatri-Rao product $\mat{A}\hada\mat{B}$ is the column-wise Kronecker product, that is, its $j$'th column is $\mat{A}_{:, j}\kron\mat{B}_{:, j}$.
+\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. It relates to the Column-wise Kronecker Product through
+\begin{displaymath}
+    \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}}.
+\end{displaymath}
+
+\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is a reshaping of a tensor into a matrix, rearranging the elements such that mode $n$ corresponds to the rows of the resulting matrix while all other modes are vectorized in the columns. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$.
The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$ (with $0$-based indices $i_k$ and $j$). For example, for a matrix $\mat{A}$, i.e. a tensor of order $2$, the unfoldings are $\mat{A}_{(1)} = \mat{A}$ and $\mat{A}_{(2)} = \t{\mat{A}}$.
+
+\paragraph{$n$-mode Tensor Product $\ttm[n]$:}
+The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\ten{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... \times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as
+\begin{displaymath}
+    (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k}
+\end{displaymath}
+where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\ten{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$.
+
+\begin{example}[Matrix Multiplication Analogs]
+    Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, and the $1$-mode and $2$-mode products are element-wise given by
+    \begin{align*}
+        (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l}
+        = (\mat{B}\mat{A})_{i,j}
+        & \text{for }t_1 = q, \\
+        (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l}
+        = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j}
+        & \text{for }t_2 = q.
+    \end{align*}
+    In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode product equals $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix.
+\end{example}
+
+\begin{example}[Order Three Analogs]
+    Let $\mat{A}$ be a tensor of dimensions $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products take the following form
+    \begin{align*}
+        (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\
+        (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\
+        (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q.
+    \end{align*}
+\end{example}
+
+Let $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$; then
+\begin{displaymath}
+    \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:}
+\end{displaymath}
+or, in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$.
+An analogous way of writing this in matricized form is
+\begin{displaymath}
+    (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}}).
+\end{displaymath}
+
+\section{Equivalencies}
+In this section we give a short summary of alternative but equivalent operations.
+We use the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same elements in the same order but may have different dimensions.
Meaning, when vectorizing ether side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$). + +Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$. + +\begin{align*} + \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\ + \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) +\end{align*} + +% \section{Matrix Valued Normal Distribution} +% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted +% \begin{displaymath} +% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), +% \end{displaymath} +% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by +% \begin{displaymath} +% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). +% \end{displaymath} + +% \section{Sampling form a Multi-Array Normal Distribution} +% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed +% \begin{displaymath} +% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). +% \end{displaymath} +% Its density is given by +% \begin{displaymath} +% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} +% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) +% \end{displaymath} +% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution +% \begin{displaymath} +% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) +% \end{displaymath} +% with $p = \prod_{i = 1}^r p_i$. + +% \todo{Check this!!!} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Alternative covariance estimates}\label{sec:approx} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + An alternative approach is \emph{not} to use the MLE estimates for $\mat\Delta_1$, $\mat\Delta_2$ but (up to scaling) unbiased estimates. \begin{displaymath} \widetilde{\mat\Delta}_1 = \frac{1}{n}\sum_{i = 1}^n \t{\mat{R}_i}\mat{R}_i {\color{gray}\quad(q\times q)},\qquad @@ -535,323 +1161,6 @@ and therefore the gradients \newpage -\section{Thoughts on initial value estimation} -\todo{This section uses an alternative notation as it already tries to generalize to general multi-dimensional arrays. Furthermore, one of the main differences is that the observation are indexed in the \emph{last} mode. The benefit of this is that the mode product and parameter matrix indices match not only in the population model but also in sample versions.} -Let $\ten{X}_i, \ten{F}_i$ be order $r$ tensors of dimensions $p_1\times ... \times p_r$ and $q_1\times ... \times q_r$, respectively. Also denote the error tensor $\epsilon$ of the same order and dimensions as $\ten{X}_i$. 
The considered model for the $i$'th observation is -\begin{displaymath} - \ten{X}_i = \ten{\mu} + \ten{F}_i\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r \} + \ten{\epsilon}_i -\end{displaymath} -where we assume $\ten{\epsilon}_i$ to be i.i.d. mean zero tensor normal distributed $\ten{\epsilon}_i\sim\mathcal{TN}(0, \mat{\Delta}_1, ..., \mat{\Delta}_r)$ for $\mat{\Delta}_j\in\mathcal{S}^{p_j}_{++}$, $j = 1, ..., r$. Given $i = 1, ..., n$ observations the collected model containing all observations -\begin{displaymath} - \ten{X} = \ten{\mu} + \ten{F}\times\{ \mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n \} + \ten{\epsilon} -\end{displaymath} -which is almost identical as the observations $\ten{X}_i, \ten{F}_i$ are stacked on an addition $r + 1$ mode leading to response, predictor and error tensors $\ten{X}, \ten{F}$ of order $r + 1$ and dimensions $p_1\times...\times p_r\times n$ for $\ten{X}, \ten{\epsilon}$ and $q_1\times...\times q_r\times n$ for $\ten{F}$. - -In the following we assume w.l.o.g that $\ten{\mu} = 0$, as if this is not true we simply replace $\ten{X}_i$ with $\ten{X}_i - \ten{\mu}$ for $i = 1, ..., n$ before collecting all the observations in the response tensor $\ten{X}$. - -The goal here is to find reasonable estimates for $\mat{\alpha}_j$, $j = 1, ..., n$ for the mean model -\begin{displaymath} - \E \ten{X}|\ten{F}, \mat{\alpha}_1, ..., \mat{\alpha}_r = \ten{F}\times\{\mat{\alpha}_1, ..., \mat{\alpha}_r, \mat{I}_n\} - = \ten{F}\times_{j\in[r]}\mat{\alpha}_j. -\end{displaymath} -Under the mean model we have using the general mode product relation $(\ten{A}\times_j\mat{B})_{(j)} = \mat{B}\ten{A}_{(j)}$ we get -\begin{align*} - \ten{X}_{(j)}\t{\ten{X}_{(j)}} \overset{\text{SVD}}{=} \mat{U}_j\mat{D}_j\t{\mat{U}_j} - = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} - \t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}}\t{\mat{\alpha}_j} -\end{align*} -for the $j = 1, ..., r$ modes. Using this relation we construct an iterative estimation process by setting the initial estimates of $\hat{\mat{\alpha}}_j^{(0)} = \mat{U}_j[, 1:q_j]$ which are the first $q_j$ columns of $\mat{U}_j$. - -For getting least squares estimates for $\mat{\alpha}_j$, $j = 1, ..., r$ we observe that by matricization of the mean model -\begin{displaymath} - \ten{X}_{(j)} = (\ten{F}\times_{k\in[r]}\mat{\alpha}_k)_{(j)} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)} -\end{displaymath} -leads to normal equations for each $\mat{\alpha}_j$, $j = 1, ..., r$ -\begin{displaymath} - \ten{X}_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} = \mat{\alpha}_j(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}\t{(\ten{F}\times_{k\in[r]\backslash j}\mat{\alpha}_k)_{(j)}} -\end{displaymath} -where the normal equations for $\mat{\alpha}_j$ depend on all the other $\mat{\alpha}_k$. With the initial estimates from above this allows an alternating approach. 
Index with $t = 1, ...$ the current iteration, then a new estimate $\widehat{\mat{\alpha}}_j^{(t)}$ given the previous estimates $\widehat{\mat{\alpha}}_k^{(t-1)}$, $k = 1, ..., r$ is computed as -\begin{displaymath} - \widehat{\mat{\alpha}}_j^{(t)} = - \ten{X}_{(j)} - \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} - \left( - \big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)} - \t{\big(\ten{F}\times_{k\in[r]\backslash j}\widehat{\mat{\alpha}}_k^{(t-1)}\big)_{(j)}} - \right)^{-1} -\end{displaymath} -for $j = 1, ..., r$ until convergence or a maximum number of iterations is exceeded. The final estimates are the least squares estimates by this procedure. - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Numerical Examples %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\section{Numerical Examples} -% The first example (which by it self is \emph{not} exemplary) is the estimation with parameters $n = 200$, $p = 11$, $q = 5$, $k = 14$ and $r = 9$. The ``true'' matrices $\mat\alpha$, $\mat\beta$ generated by sampling there elements i.i.d. standard normal like the responses $y$. Then, for each observation, $\mat{f}_y$ is computed as $\sin(s_{i, j} y + o_{i j})$ \todo{ properly describe} to fill the elements of $\mat{f}_y$. Then the $\mat{X}$'s are samples as -% \begin{displaymath} -% \mat{X} = \mat{\beta}\mat{f}_y \t{\mat{\alpha}} + \mat{\epsilon}, \qquad \vec{\mat{\epsilon}} \sim \mathbb{N}_{p q}(\mat{0}, \mat{\Delta}) -% \end{displaymath} -% where $\mat{\Delta}_{i j} = 0.5^{|i - j|}$ for $i, j = 1, ..., p q$. - -\begin{table}[!ht] - \centering - % see: https://en.wikibooks.org/wiki/LaTeX/Tables - \begin{tabular}{ll|r@{ }l *{3}{r@{.}l}} - method & init - & \multicolumn{2}{c}{loss} - & \multicolumn{2}{c}{MSE} - & \multicolumn{2}{c}{$\dist(\hat{\mat\alpha}, \mat\alpha)$} - & \multicolumn{2}{c}{$\dist(\hat{\mat\beta}, \mat\beta)$} - \\ \hline - base & vlp & -2642&(1594) & 1&82 (2.714) & 0&248 (0.447) & 0&271 (0.458) \\ - new & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ - new & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ - momentum & vlp & -2704&(1452) & 1&78 (2.658) & 0&233 (0.438) & 0&260 (0.448) \\ - momentum & ls & -3479& (95) & 0&99 (0.025) & 0&037 (0.017) & 0&035 (0.015) \\ - approx & vlp & 6819&(1995) & 3&99 (12.256) & 0&267 (0.448) & 0&287 (0.457) \\ - approx & ls & 5457& (163) & 0&99 (0.025) & 0&033 (0.017) & 0&030 (0.012) \\ - \end{tabular} - \caption{Mean (standard deviation) for simulated runs of $20$ repititions for the model $\mat{X} = \mat{\beta}\mat{f}_y\t{\mat{\alpha}}$ of dimensinos $(p, q) = (11, 7)$, $(k, r) = (3, 5)$ with a sample size of $n = 200$. The covariance structure is $\mat{\Delta} = \mat{\Delta}_2\otimes \mat{\Delta}_1$ for $\Delta_i = \text{AR}(\sqrt{0.5})$, $i = 1, 2$. 
The functions applied to the standard normal response $y$ are $\sin, \cos$ with increasing frequency.} -\end{table} - -% \begin{figure} -% \centering -% \includegraphics{loss_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{estimates_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{Delta_Ex01.png} -% \end{figure} -% \begin{figure} -% \centering -% \includegraphics{hist_Ex01.png} -% \end{figure} - -\begin{table}[!ht] - \centering - \begin{tabular}{*{3}{l} | *{7}{c}} - method & init & npc & Acc & Err & FPR & TPR & FNR & TNR & AUC (sd) \\ \hline - base & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ - new & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ - new & ls & (3, 4) & 0.74 & 0.26 & 0.51 & 0.88 & 0.12 & 0.49 & 0.77 (0.045) \\ - ls & & (3, 4) & 0.75 & 0.25 & 0.49 & 0.88 & 0.12 & 0.51 & 0.78 (0.044) \\ - ls$^*$ & & (3, 4) & 0.78 & 0.22 & 0.38 & 0.87 & 0.13 & 0.62 & 0.86 (0.034) \\ % - LSIR & & (3, 4) & 0.80 & 0.20 & 0.36 & 0.88 & 0.12 & 0.64 & 0.85 (0.036) \\ - momentum & vlp & (3, 4) & 0.70 & 0.30 & 0.60 & 0.87 & 0.13 & 0.40 & 0.75 (0.047) \\ - momentum & ls & (3, 4) & 0.70 & 0.30 & 0.58 & 0.87 & 0.13 & 0.42 & 0.76 (0.046) \\ - approx & vlp & (3, 4) & 0.68 & 0.32 & 0.62 & 0.86 & 0.14 & 0.38 & 0.74 (0.048) \\ - approx & ls & (3, 4) & 0.73 & 0.27 & 0.53 & 0.88 & 0.12 & 0.47 & 0.78 (0.044) \\ \hline - ls & & (15, 15) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ - ls$^*$ & & (15, 15) & 0.76 & 0.24 & 0.44 & 0.88 & 0.12 & 0.56 & 0.83 (0.039) \\ % - LSIR & & (15, 15) & 0.72 & 0.28 & 0.44 & 0.82 & 0.18 & 0.56 & 0.81 (0.040) \\ - approx & ls & (15, 15) & 0.73 & 0.27 & 0.51 & 0.87 & 0.13 & 0.49 & 0.78 (0.044) \\ \hline - ls & & (20, 30) & 0.75 & 0.25 & 0.47 & 0.87 & 0.13 & 0.53 & 0.78 (0.044) \\ - ls$^*$ & & (20, 30) & 0.77 & 0.23 & 0.36 & 0.84 & 0.16 & 0.64 & 0.79 (0.045) \\ % - LSIR & & (20, 30) & 0.79 & 0.21 & 0.36 & 0.87 & 0.13 & 0.64 & 0.83 (0.038) \\ - approx & ls & (20, 30) & 0.63 & 0.37 & 1.00 & 1.00 & 0.00 & 0.00 & 0.51 (0.053) \\ \hline - HOPCA & & & 0.62 & 0.38 & 1 & 0.99 & 0.01 & 0 & 0.56 (0.053) \\ - ls & & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ - ls$^*$ & & & 0.68 & 0.32 & 0.51 & 0.79 & 0.21 & 0.49 & 0.66 (0.054) \\ % - approx & ls & & 0.75 & 0.25 & 0.44 & 0.87 & 0.13 & 0.56 & 0.78 (0.044) \\ - \end{tabular} - \caption{\label{tab:eeg_sim}Recreation of the EEG data LOO-CV from \cite{sdr-PfeifferKaplaBura2021} Section~7, EEG Data and Table~6 with new methods. Column \emph{vlp} stands for the Van Loan and Pitsianis initialization scheme as described in \cite{sdr-PfeifferKaplaBura2021} and \emph{ls} is the approach described above. The column \emph{npc} gives the number of principal component of the $(2D)^2 PCA$ preprocessing. 
Reduction by $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\alpha}}_j}$ instread of $^*$ where reduction is $\ten{X}\times_{j\in[r]}\t{\widehat{\mat{\Delta}}_j^{-1}\widehat{\mat{\alpha}}_j}$.} -\end{table} - -\begin{table} - \centering - \begin{tabular}{*{3}{l} | *{3}{r@{.}l}} - method & init & npc - & \multicolumn{2}{c}{elapsed} - & \multicolumn{2}{c}{sys.self} - & \multicolumn{2}{c}{user.self} \\ \hline - base & & (3, 4) & 0&079 & 0&402 & 0&220 \\ - new & vlp & (3, 4) & 0&075 & 0&393 & 0&217 \\ - new & ls & (3, 4) & 0&218 & 0&243 & 0&305 \\ - ls & & (3, 4) & 0&003 & 0&006 & 0&006 \\ - LSIR & & (3, 4) & 0&002 & 0&000 & 0&002 \\ - momentum & vlp & (3, 4) & 0&143 & 0&595 & 0&359 \\ - momentum & ls & (3, 4) & 0&297 & 0&252 & 0&385 \\ - approx & vlp & (3, 4) & 0&044 & 0&240 & 0&152 \\ - approx & ls & (3, 4) & 0&066 & 0&144 & 0&121 \\ \hline - ls & & (15, 15) & 0&012 & 0&059 & 0&034 \\ - LSIR & & (15, 15) & 0&010 & 0&050 & 0&031 \\ - approx & ls & (15, 15) & 0&813 & 3&911 & 2&325 \\ \hline - ls & & (20, 30) & 0&028 & 0&129 & 0&080 \\ - LSIR & & (20, 30) & 0&030 & 0&142 & 0&100 \\ - approx & ls & (20, 30) & 2&110 & 10&111 & 6&290 \\ \hline - HOPCA & & & 0&24 & 0&37 & 0&43 \\ - ls & & & 1&252 & 6&215 & 3&681 \\ - approx & ls & & 36&754 & 141&028 & 147&490 \\ - \end{tabular} - \caption{\label{tab:eeg_time}Like Table~\ref{tab:eeg_sim} but reports the mean run-time.} -\end{table} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Bib and Index %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\printindex -\nocite{*} -\printbibliography - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Appendix %%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\appendix -\section{Matrix Differential Rules} -Let $\mat A$ be a square matrix (and invertible if needed) and $|.|$ stands for the determinant -\begin{align*} - \d\log\mat A &= \frac{1}{|\mat A|}\d\mat{A} \\ - \d|\mat A| &= |\mat A|\tr \mat{A}^{-1}\d\mat A \\ - \d\log|\mat A| &= \tr\mat{A}^{-1}\d\mat A \\ - \d\mat{X}^{-1} &= -\mat{X}^{-1}(\d\mat{X})\mat{X}^{-1} -\end{align*} - -\section{Useful Matrix Identities} -In this section we summarize a few useful matrix identities, for more details see for example \cite{MatrixAlgebra-AbadirMagnus2005}. - -For two matrices $\mat A$ of dimensions $q\times r$ and $\mat B$ of dimensions $p\times k$ holds -\begin{equation}\label{eq:vecKron} - \vec(\mat A\kron\mat B) = (\mat{I}_r\kron\mat{K}_{k,q}\kron\mat{I}_p)(\vec\mat A\kron\vec\mat B). -\end{equation} - -Let $\mat A$ be a $p\times p$ dimensional non-singular matrix. Furthermore, let $\mat a, \mat b$ be $p$ vectors such that $\t{\mat b}A^{-1}\mat a\neq -1$, then -\begin{displaymath} - (\mat A + \mat a\t{\mat b})^{-1} = \mat{A}^{-1} - \frac{1}{1 + \t{\mat b}A^{-1}\mat a}\mat{A}^{-1}\mat{a}\t{\mat{b}}\mat{A}^{-1} -\end{displaymath} -as well as -\begin{displaymath} - \det(\mat A + \mat a\t{\mat b}) = \det(\mat A)(1 + \t{\mat b}{\mat A}^{-1}\mat a) -\end{displaymath} -which even holds in the case $\t{\mat b}A^{-1}\mat a = -1$. This is known as Sylvester's determinant theorem. - - -\section{Commutation Matrix and Permutation Identities} -\begin{center} - Note: In this section we use 0-indexing for the sake of simplicity! -\end{center} -In this section we summarize relations between the commutation matrix and corresponding permutation. We also list some extensions to ``simplify'' or represent some term. 
This is mostly intended for implementation purposes and understanding of terms occurring in the computations. - -Let $\mat A$ be an arbitrary $p\times q$ matrix. The permutation matrix $\mat K_{p, q}$ satisfies -\begin{displaymath} - \mat{K}_{p, q}\vec{\mat{A}} = \vec{\t{\mat{A}}} \quad\Leftrightarrow\quad (\vec{\mat{A}})_{\pi_{p, q}(i)} = (\vec{\t{\mat{A}}})_{i}, \quad\text{for } i = 0, ..., p q - 1 -\end{displaymath} -where $\pi_{p, q}$ is a permutation of the indices $i = 0, ..., p q - 1$ such that -\begin{displaymath} - \pi_{p, q}(i + j p) = j + i q, \quad\text{for }i = 0, ..., p - 1; j = 0, ..., q - 1. -\end{displaymath} - -\begin{table}[!htp] - \centering - \begin{minipage}{0.8\textwidth} - \centering - \begin{tabular}{l c l} - $\mat{K}_{p, q}$ & $\hat{=}$ & $\pi_{p, q}(i + j p) = j + i q$ \\ - $\mat{I}_r\kron\mat{K}_{p, q}$ & $\hat{=}$ & $\tilde{\pi}_{p, q, r}(i + j p + k p q) = j + i q + k p q$ \\ - $\mat{K}_{p, q}\kron\mat{I}_r$ & $\hat{=}$ & $\hat{\pi}_{p, q, r}(i + j p + k p q) = r(j + i q) + k$ - \end{tabular} - \caption{\label{tab:commutation-permutation}Commutation matrix terms and corresponding permutations. Indices are all 0-indexed with the ranges; $i = 0, ..., p - 1$, $j = 0, ..., q - 1$ and $k = 0, ..., r - 1$.} - \end{minipage} -\end{table} - - - -\section{Matrix and Tensor Operations} - -The \emph{Kronecker product}\index{Operations!Kronecker@$\kron$ Kronecker product} is denoted as $\kron$ and the \emph{Hadamard product} uses the symbol $\circ$. We also need the \emph{Khatri-Rao product}\index{Operations!KhatriRao@$\hada$ Khatri-Rao product} -$\hada$ as well as the \emph{Transposed Khatri-Rao product} $\odot_t$ (or \emph{Face-Splitting product}). There is also the \emph{$n$-mode Tensor Matrix Product}\index{Operations!ttm@$\ttm[n]$ $n$-mode tensor product} denoted by $\ttm[n]$ in conjunction with the \emph{$n$-mode Matricization} of a Tensor $\mat{T}$ written as $\mat{T}_{(n)}$, which is a matrix. See below for definitions and examples of these operations.\todo{ Definitions and Examples} - -\todo{ resolve confusion between Khatri-Rao, Column-wise Kronecker / Khatri-Rao, Row-wise Kronecker / Khatri-Rao, Face-Splitting Product, .... Yes, its a mess.} -\paragraph{Kronecker Product $\kron$:} -\paragraph{Khatri-Rao Product $\hada$:} -\paragraph{Transposed Khatri-Rao Product $\odot_t$:} This is also known as the Face-Splitting Product and is the row-wise Kronecker product of two matrices. If relates to the Column-wise Kronecker Product through -\begin{displaymath} - \t{(\mat{A}\odot_{t}\mat{B})} = \t{\mat{A}}\hada\t{\mat{B}} -\end{displaymath} - -\paragraph{$n$-mode unfolding:} \emph{Unfolding}, also known as \emph{flattening} or \emph{matricization}, is an reshaping of a tensor into a matrix with rearrangement of the elements such that mode $n$ corresponds to columns of the result matrix and all other modes are vectorized in the rows. Let $\ten{T}$ be a tensor of order $m$ with dimensions $t_1\times ... \times t_n\times ... \times t_m$ and elements indexed by $(i_1, ..., i_n, ..., i_m)$. The $n$-mode flattening, denoted $\ten{T}_{(n)}$, is defined as a $(t_n, \prod_{k\neq n}t_k)$ matrix with element indices $(i_n, j)$ such that $j = \sum_{k = 1, k\neq n}^m i_k\prod_{l = 1, l\neq n}^{k - 1}t_l$. -\todo{ give an example!} - -\paragraph{$n$-mode Tensor Product $\ttm[n]$:} -The \emph{$n$-mode tensor product} $\ttm[n]$ between a tensor $\mat{T}$ of order $m$ with dimensions $t_1\times t_2\times ... \times t_n\times ... 
\times t_m$ and a $p\times t_n$ matrix $\mat{M}$ is defined element-wise as -\begin{displaymath} - (\ten{T}\ttm[n] \mat{M})_{i_1, ..., i_{n-1}, j, i_{n+1}, ..., i_m} = \sum_{k = 1}^{t_n} \ten{T}_{i_1, ..., i_{n-1}, k, i_{n+1}, ..., i_m} \mat{M}_{j, k} -\end{displaymath} -where $i_1, ..., i_{n-1}, i_{n+1}, ..., i_m$ run from $1$ to $t_1, ..., t_{n-1}, t_{n+1}, ..., t_m$, respectively. Furthermore, the $n$-th fiber index $j$ of the product ranges from $1$ to $p$. This gives a new tensor $\mat{T}\ttm[n]\mat{M}$ of order $m$ with dimensions $t_1\times t_2\times ... \times p\times ... \times t_m$. - -\begin{example}[Matrix Multiplication Analogs] - Let $\mat{A}$, $\mat{B}$ be two matrices with dimensions $t_1\times t_2$ and $p\times q$, respectively. Then $\mat{A}$ is also a tensor of order $2$, now the $1$-mode and $2$-mode products are element wise given by - \begin{align*} - (\mat{A}\ttm[1] \mat{B})_{i,j} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j}\mat{B}_{i,l} - = (\mat{B}\mat{A})_{i,j} - & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2] \mat{B})_{i,j} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l}\mat{B}_{j,l} - = (\mat{A}\t{\mat{B}})_{i,j} = \t{(\mat{B}\t{\mat{A}})}_{i,j} - & \text{for }t_2 = q. - \end{align*} - In other words, the $1$-mode product equals $\mat{A}\ttm[1] \mat{B} = \mat{B}\mat{A}$ and the $2$-mode is $\mat{A}\ttm[2] \mat{B} = \t{(\mat{B}\t{\mat{A}})}$ in the case of the tensor $\mat{A}$ being a matrix. -\end{example} - -\begin{example}[Order Three Analogs] - Let $\mat{A}$ be a tensor of the form $t_1\times t_2\times t_3$ and $\mat{B}$ a matrix of dimensions $p\times q$, then the $n$-mode products have the following look - \begin{align*} - (\mat{A}\ttm[1]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_1} \mat{A}_{l,j,k}\mat{B}_{i,l} & \text{for }t_1 = q, \\ - (\mat{A}\ttm[2]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_2} \mat{A}_{i,l,k}\mat{B}_{j,l} \equiv (\mat{B}\mat{A}_{i,:,:})_{j,k} & \text{for }t_2 = q, \\ - (\mat{A}\ttm[3]\mat{B})_{i,j,k} &= \sum_{l = 1}^{t_3} \mat{A}_{i,j,l}\mat{B}_{k,l} \equiv \t{(\mat{B}\t{\mat{A}_{i,:,:}})}_{j,k} & \text{for }t_3 = q. - \end{align*} -\end{example} - -Letting $\ten{F}$ be the $3$-tensor of dimensions $n\times k\times r$ such that $\ten{F}_{i,:,:} = \mat{f}_{y_i}$, then -\begin{displaymath} - \mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}} = (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{i,:,:} -\end{displaymath} -or in other words, the $i$-th slice of the tensor product $\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha}$ contains $\mat{\beta}\mat{f}_{y_i}\t{\mat{\alpha}}$ for $i = 1, ..., n$. -Another analog way of writing this is -\begin{displaymath} - (\ten{F}\ttm[2]\mat{\beta}\ttm[3]\mat{\alpha})_{(1)} = \mathbb{F}_{y}(\t{\mat{\alpha}}\kron\t{\mat{\beta}}) -\end{displaymath} - -\section{Equivalencies} -In this section we give a short summary of alternative but equivalent operations. -Using the notation $\widehat{=}$ to indicate that two expressions are identical in the sense that they contain the same element in the same order but may have different dimensions. Meaning, when vectorizing ether side of $\widehat{=}$, they are equal ($\mat{A}\widehat{=}\mat{B}\ :\Leftrightarrow\ \vec{\mat{A}} = \vec{\mat{B}}$). - -Therefore, we use $\mat{A}, \mat{B}, \mat{X}, \mat{F}, \mat{R}, ...$ for matrices. 3-Tensors are written as $\ten{A}, \ten{B}, \ten{T}, \ten{X}, \ten{F}, \ten{R}, ...$. 
- -\begin{align*} - \ten{T}\ttm[3]\mat{A}\ &{\widehat{=}}\ \mat{T}\t{\mat A} & \ten{T}(n, p, q)\ \widehat{=}\ \mat{T}(n p, q), \mat{A}(p, q) \\ - \ten{T}\ttm[2]\mat{B}\ &{\widehat{=}}\ \mat{B}\ten{T}_{(2)} & \ten{T}(n, p, q), \ten{T}_{(2)}(p, n q), \mat{B}(q, p) -\end{align*} - -% \section{Matrix Valued Normal Distribution} -% A random variable $\mat{X}$ of dimensions $p\times q$ is \emph{Matrix-Valued Normal Distribution}, denoted -% \begin{displaymath} -% \mat{X}\sim\mathcal{MN}_{p\times q}(\mat{\mu}, \mat{\Delta}_2, \mat{\Delta}_1), -% \end{displaymath} -% if and only if $\vec\mat{X}\sim\mathcal{N}_{p q}(\vec\mat\mu, \mat\Delta_1\otimes\mat\Delta_2)$. Note the order of the covariance matrices $\mat\Delta_1, \mat\Delta_2$. Its density is given by -% \begin{displaymath} -% f(\mat{X}) = \frac{1}{(2\pi)^{p q / 2}|\mat\Delta_1|^{p / 2}|\mat\Delta_2|^{q / 2}}\exp\left(-\frac{1}{2}\tr(\mat\Delta_1^{-1}\t{(\mat X - \mat \mu)}\mat\Delta_2^{-1}(\mat X - \mat \mu))\right). -% \end{displaymath} - -% \section{Sampling form a Multi-Array Normal Distribution} -% Let $\ten{X}$ be an order (rank) $r$ Multi-Array random variable of dimensions $p_1\times...\times p_r$ following a Multi-Array (or Tensor) Normal distributed -% \begin{displaymath} -% \ten{X}\sim\mathcal{TN}(\mu, \mat{\Delta}_1, ..., \mat{\Delta}_r). -% \end{displaymath} -% Its density is given by -% \begin{displaymath} -% f(\ten{X}) = \Big( \prod_{i = 1}^r \sqrt{(2\pi)^{p_i}|\mat{\Delta}_i|^{q_i}} \Big)^{-1} -% \exp\!\left( -\frac{1}{2}\langle \ten{X} - \mu, (\ten{X} - \mu)\times\{\mat{\Delta}_1^{-1}, ..., \mat{\Delta}_r^{-1}\} \rangle \right) -% \end{displaymath} -% with $q_i = \prod_{j \neq i}p_j$. This is equivalent to the vectorized $\vec\ten{X}$ following a Multi-Variate Normal distribution -% \begin{displaymath} -% \vec{\ten{X}}\sim\mathcal{N}_{p}(\vec{\mu}, \mat{\Delta}_r\otimes...\otimes\mat{\Delta}_1) -% \end{displaymath} -% with $p = \prod_{i = 1}^r p_i$. - -% \todo{Check this!!!} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Reference Summaries %%% diff --git a/README.md b/README.md deleted file mode 100644 index 7eed2fe..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# tensor_predictors - -Implementation of methods and simulation source for "Least Squares and Maximum Likelihood Estimation of Sufficient Reductions in Regressions with Matrix Valued Predictors". \ No newline at end of file diff --git a/mvbernoulli/DESCRIPTION b/mvbernoulli/DESCRIPTION new file mode 100644 index 0000000..1fc0b76 --- /dev/null +++ b/mvbernoulli/DESCRIPTION @@ -0,0 +1,16 @@ +Package: mvbernoulli +Type: Package +Title: Multivariate Bernoulli Regression +Version: 1.0 +Date: 2022-07-19 +Author: Daniel Kapla +Maintainer: Daniel Kapla +Description: Implements regression routines for the general Multivariate + Bernoulli and the special case of the Ising model with and without + regressors. 
+License: GPL (>= 2) +Imports: Rcpp (>= 1.0.8) +SystemRequirements: C++17 +LinkingTo: Rcpp +Encoding: UTF-8 +RoxygenNote: 7.1.1 diff --git a/mvbernoulli/NAMESPACE b/mvbernoulli/NAMESPACE new file mode 100644 index 0000000..4ed8739 --- /dev/null +++ b/mvbernoulli/NAMESPACE @@ -0,0 +1,7 @@ +useDynLib(mvbernoulli, .registration=TRUE) +importFrom(Rcpp, evalCpp) +exportPattern("^[[:alpha:]]+") +S3method(print, mvbinary) +S3method(mean, mvbinary) +S3method(cov, mvbinary) +S3method("[", mvbinary) diff --git a/mvbernoulli/R/RcppExports.R b/mvbernoulli/R/RcppExports.R new file mode 100644 index 0000000..80e3df2 --- /dev/null +++ b/mvbernoulli/R/RcppExports.R @@ -0,0 +1,162 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +ising_log_odds_sum <- function(y, theta) { + .Call(`_mvbernoulli_ising_log_odds_sum`, y, theta) +} + +#' Ising model scaling factor `p_0(theta)` for the Ising model +#' +ising_prob0 <- function(theta) { + .Call(`_mvbernoulli_ising_prob0`, theta) +} + +#' Ising model probabilities for every event, this returns a vector of size `2^p` +#' with indices corresponding to the events binary representation. +#' +#' Note: the R indexing leads to adding +1 to every index. +#' +ising_probs <- function(theta) { + .Call(`_mvbernoulli_ising_probs`, theta) +} + +#' Computes the zero conditioned probabilities +#' P(Y_i = 1 | Y_-i = 0) +#' and +#' P(Y_i = 1, Y_j = 1 | Y_-i = 0) +#' from the natural parameters `theta` with matching indexing. +#' +#' This is the inverse function of `ising_theta_from_cond_prob` +#' +ising_cond_probs <- function(theta) { + .Call(`_mvbernoulli_ising_cond_probs`, theta) +} + +#' Computes the expectation of `Y` under the Ising model with natural parameter +#' `theta` given component wise by +#' +#' E Y_i = P(Y_i = 1) +#' +ising_expectation <- function(theta) { + .Call(`_mvbernoulli_ising_expectation`, theta) +} + +#' Computes the covariance (second centered moment) of `Y` under the Ising model +#' with natural parameter `theta`. +#' +#' cov(Y, Y) = E[Y Y'] - E[Y] E[Y]' +#' +ising_cov <- function(theta) { + .Call(`_mvbernoulli_ising_cov`, theta) +} + +#' Computes the single and two way effect marginal probabilities +#' +#' P(Y_i = 1) +#' and +#' P(Y_i Y_j = 1) +#' +#' In its vectorized form this function computes E[vech(Y Y')] +#' +ising_marginal_probs <- function(theta) { + .Call(`_mvbernoulli_ising_marginal_probs`, theta) +} + +#' Natural parameters from the sufficient conditional probability statistis `pi` +#' +#' Computes the natural parameters `theta` of the Ising model from zero +#' conditioned probabilites for single and two way effects. 
+#' +#' This is the inverse function of `ising_cond_prob_from_theta` +#' +ising_theta_from_cond_prob <- function(pi) { + .Call(`_mvbernoulli_ising_theta_from_cond_prob`, pi) +} + +#' Computes the log-lokelihood at natural parameters `theta` of the Ising model +#' given a set of observations `Y` +#' +#' l(theta) = log(p_0(theta)) + n^-1 sum_i vech(y_i y_i')' theta +#' +ising_log_likelihood <- function(theta, Y) { + .Call(`_mvbernoulli_ising_log_likelihood`, theta, Y) +} + +#' Computes the Score of the Ising model, this is the gradiend of the (mean) +#' log-likelihood with respect to the natural parameters +#' +#' grad l(theta) = -E[vech(Y Y')] + n^-1 sum_i vech(y_i y_i') +#' +ising_score <- function(theta, Y) { + .Call(`_mvbernoulli_ising_score`, theta, Y) +} + +ising_conditional_log_likelihood <- function(alpha, X, Y) { + .Call(`_mvbernoulli_ising_conditional_log_likelihood`, alpha, X, Y) +} + +ising_conditional_score <- function(alpha, X, Y) { + .Call(`_mvbernoulli_ising_conditional_score`, alpha, X, Y) +} + +#' Fisher information for the natural parameters `theta` under the Ising model. +#' +#' I(theta) = -E(H l(theta) | theta) = Cov(vech(Y Y'), vech(Y Y') | theta) +#' +#' where `H l(theta)` is the Hessian of the log-likelihood `l(theta)` defined as +#' +#' l(theta) = n^-1 prod_i P(Y = y_i | theta) +#' = log(p_0(theta)) - mean_i exp(vech(y_i y_i')' theta) +#' +#' Note that the Fisher information does _not_ depend on data. +#' +ising_fisher_info <- function(theta) { + .Call(`_mvbernoulli_ising_fisher_info`, theta) +} + +#' Samples from the Ising model given the natural parameters `theta` +#' +ising_sample <- function(n, theta, warmup = 1000L) { + .Call(`_mvbernoulli_ising_sample`, n, theta, warmup) +} + +print.mvbinary <- function(binary, nrLines = 10L) { + invisible(.Call(`_mvbernoulli_print_mvbinary`, binary, nrLines)) +} + +printBits <- function(ints) { + invisible(.Call(`_mvbernoulli_printBits`, ints)) +} + +#' Converts a logical matrix to a multi variate bernoulli dataset +#' +as.mvbinary <- function(Y) { + .Call(`_mvbernoulli_as_mvbinary`, Y) +} + +#' Converts a Multivariate binary data set into a logical matrix +#' +as.mvbmatrix <- function(Y) { + .Call(`_mvbernoulli_as_mvbmatrix`, Y) +} + +#' Mean for a multi variate bernoulli dataset `MVBinary` +#' +#' mean_i y_i # twoway = false (only single effects) +#' +#' or +#' +#' mean_i vech(y_i y_i') # twoway = true (with two-way interactions) +#' +mean.mvbinary <- function(Y, twoway = FALSE) { + .Call(`_mvbernoulli_mean_mvbinary`, Y, twoway) +} + +#' Covariance for multi variate binary data `MVBinary` +#' +#' cov(Y) = (n - 1)^-1 sum_i (y_i - mean(Y)) (y_i - mean(Y))' +#' +cov.mvbinary <- function(Y) { + .Call(`_mvbernoulli_cov_mvbinary`, Y) +} + diff --git a/mvbernoulli/R/extract.R b/mvbernoulli/R/extract.R new file mode 100644 index 0000000..3dc2b0c --- /dev/null +++ b/mvbernoulli/R/extract.R @@ -0,0 +1,12 @@ +#' Extract parts of an `mvbinary` data set +#' +#' @param obj object of class `mvbinary` +#' @param drop if true, the returned object is only a integer vector +#' +#' @return object of class `mvbinary` +#' +#' @export +`[.mvbinary` <- function(obj, i) { + # subset integer vector and set class and dimension attributes + structure(c(obj)[i], class = "mvbinary", p = attr(obj, "p")) +} diff --git a/mvbernoulli/inst/examples/ising_grad.R b/mvbernoulli/inst/examples/ising_grad.R new file mode 100644 index 0000000..5312c21 --- /dev/null +++ b/mvbernoulli/inst/examples/ising_grad.R @@ -0,0 +1,363 @@ +library(mvbernoulli) + +printMVBinary 
<- function(Y) { + Y <- array(as.integer(Y), dim = dim(Y)) + eventIndex <- seq_len(nrow(Y)) + eventNr <- apply(Y, 1, function(y) sum(y * 2^(rev(seq_len(p)) - 1))) + dimnames(Y) <- list( + "Index/Event" = paste(eventIndex, eventNr, sep = "/"), + "Bit Index" = as.character(rev(seq_len(p)) - 1) + ) + print.table(Y, zero.print = ".") +} + + +n <- 100 +p <- 6 + +(theta <- rnorm(p * (p + 1) / 2)) + +pi <- ising_cond_probs(theta) +all.equal( + theta, + ising_theta_from_cond_prob(pi) +) + +tensorPredictors::matrixImage({ + Theta <- matrix(NA, p, p) + Theta[lower.tri(Theta, diag = TRUE)] <- theta + Theta[upper.tri(Theta)] <- t(Theta)[upper.tri(Theta)] + Theta +}, main = expression(paste("natural Params ", Theta))) +tensorPredictors::matrixImage({ + PI <- matrix(NA, p, p) + PI[lower.tri(PI, diag = TRUE)] <- ising_cond_probs(theta) + PI[upper.tri(PI)] <- t(PI)[upper.tri(PI)] + PI +}, main = expression(paste("Conditional Probs. P(", Y[i], " = ", Y[j], " = 1", " | ", Y[-i - j], " = ", 0, ")"))) +tensorPredictors::matrixImage({ + MAR <- matrix(NA, p, p) + MAR[lower.tri(MAR, diag = TRUE)] <- ising_marginal_probs(theta) + MAR[upper.tri(MAR)] <- t(MAR)[upper.tri(MAR)] + MAR +}, main = expression(paste("Marginal Probs. P(", Y[i], " = ", Y[j], " = 1)"))) + +Y <- matrix(sample(c(TRUE, FALSE), n * p, replace = TRUE), n) +printMVBinary(Y) + + +allY <- function(p) { + events <- c(FALSE, TRUE) + for (. in seq_len(p - 1)) { + events <- rbind( + cbind(FALSE, events), + cbind( TRUE, events) + ) + } + events +} +printMVBinary(allY(p)) + +G <- ising_score(theta, Y) +# Numeric gradiend +log.likelihood <- function(theta, Y) { + p <- ncol(Y) + # check sizes + stopifnot(p * (p + 1) == 2 * length(theta)) + # and reverse column order + # this is needed as internally the left are the high bits (high index) and + # the right are the low bits (low index) which means for matching indices + # we need to reverse the column order + Y <- Y[, rev(seq_len(p)), drop = FALSE] + # calc scaling factor + sum_0 <- sum(exp( + theta %*% apply(allY(p), 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ] + )) + # evaluate log likelihood + -log(sum_0) + mean( + theta %*% apply(Y, 1, function(y) outer(y, y, `&`))[lower.tri(diag(p), diag = TRUE), ] + ) +} + +G.num <- local({ + h <- 1e-6 + mapply(function(i) { + delta <- h * (seq_along(theta) == i) + (log.likelihood(theta + delta, Y) - log.likelihood(theta - delta, Y)) / (2 * h) + }, seq_along(theta)) +}) + +data.frame(G, G.num) + + +for (n in c(2, 7, 12, 13, 14)) { + for (p in 1:4) { + cat(sprintf("%6d / %6d\n", sum(mapply(choose, n, 0:p)), nrSubSets(n, p))) + } +} + + +p <- 5 +(A <- tcrossprod(apply(allY(p), 1, function(y) outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)]))) + +print.table(B <- ising_fisher_info(theta), zero.print = ".") + +all.equal(A[lower.tri(A, TRUE)], B[lower.tri(B, TRUE)]) + +ising_fisher_info.R <- function(theta, p) { + stopifnot(2 * length(theta) == p * (p + 1)) + + Y <- allY(p) + + # Ising model scaling factor for `P(Y = y) = p_0 exp(vech(y y')' theta)` + sum_0 <- sum(apply(Y, 1, function(y) { + vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] + exp(sum(vechYY * theta)) + })) + p_0 <- 1 / sum_0 + + # E[vech(Y Y')] + EvechYY <- p_0 * rowSums(apply(Y, 1, function(y) { + vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] + exp(sum(vechYY * theta)) * vechYY + })) + + # E[vech(Y Y') vech(Y Y')'] + EvechYYvechYY <- p_0 * matrix(rowSums(apply(Y, 1, function(y) { + vechYY <- outer(y, y, `&`)[lower.tri(diag(p), diag = TRUE)] + exp(sum(vechYY * 
theta)) * outer(vechYY, vechYY) + })), p * (p + 1) / 2) + + # Cov(vech(Y Y'), vech(Y Y')) = E[vech(Y Y') vech(Y Y')'] - E[vech(Y Y')] E[vech(Y Y')]' + EvechYYvechYY - outer(EvechYY, EvechYY) +} + +all.equal( + ising_fisher_info.R(theta, p), + ising_fisher_info(theta) +) + +p <- 10 +theta <- rnorm(p * (p + 1) / 2) +microbenchmark::microbenchmark( + ising_fisher_info.R(theta, p), + ising_fisher_info(theta) +) + + +ising_fisher_scoring <- function(Y) { + # initial estimate (guess) + ltri <- which(lower.tri(diag(p), diag = TRUE)) + theta <- ising_theta_from_cond_prob(rowMeans(apply(Y, 1, function(y) outer(y, y, `&`)[ltri]))) + + print(theta) + + ll <- log.likelihood(theta, Y) + + # iterate Fisher scoring + for (iter in 1:20) { + theta <- theta + solve(ising_fisher_info(theta), ising_score(theta, Y)) + + ll <- c(ll, log.likelihood(theta, Y)) + + cat("ll: ", tail(ll, 1), "\n") + } + + theta +} +ising_fisher_scoring(Y) + + + +microbenchmark::microbenchmark( + cov.mvbinary(Y), # double copy (TODO: change MVBinary conversion/SEXP binding) + cov(Y), # call the next expr. through default args + .Call(stats:::C_cov, Y, NULL, na.method = 4L, FALSE) +) + + +################################################################################ +### Conditional Ising Model ### +################################################################################ +n <- 1000 +p <- 10 +q <- 10 + +alpha <- matrix(rnorm(p * q), p) +X <- matrix(rnorm(n * p), n) +theta <- function(alpha, x) { + Theta <- crossprod(crossprod(x, alpha)) + diag(Theta) <- 0.5 * diag(Theta) + 2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)] +} +# sample Y ~ P( . | X = x) for x in X +system.time(Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha, x)))) +attr(Y, "p") <- as.integer(q) +class(Y) <- "mvbinary" + +# For the numeric gradient comparison +allY <- function(p) { + events <- c(FALSE, TRUE) + for (. 
in seq_len(p - 1)) { + events <- rbind( + cbind(FALSE, events), + cbind( TRUE, events) + ) + } + events +} +ising_conditional_log_likelihood.R <- function(alpha, X, Y) { + # convert Y to a binary matrix + Y <- as.mvbmatrix(Y) + #retrieve dimensions + n <- nrow(X) + p <- ncol(X) + q <- ncol(Y) + # check dimensions + stopifnot({ + nrow(Y) == n + all(dim(alpha) == c(p, q)) + }) + + # setup reused internal variables + vech_index <- which(lower.tri(diag(q), diag = TRUE)) + aaY <- apply(allY(q), 1, function(y) outer(y, y, `&`))[vech_index, ] + + # sum over all observations + ll <- 0 + for (i in seq_len(n)) { + # Theta = alpha' x x' alpha + Theta <- crossprod(crossprod(X[i, ], alpha)) + # theta = vech((2 1_q 1_q' - I_q) o Theta) + theta <- ((2 - diag(q)) * Theta)[vech_index] + + # scaling factor `p_0^-1 = sum_y exp(vech(y y')' theta)` + sum_0 <- sum(exp(theta %*% aaY)) + + print(log(sum_0)) + + # evaluate log likelihood + ll <- ll + sum(theta * outer(Y[i, ], Y[i, ], `&`)[vech_index]) - log(sum_0) + } + + ll / n +} +# numeric gradiend (score of the log-likelihood) +ising_conditional_score.R <- function(alpha, X, Y, h = 1e-6) { + matrix(mapply(function(i) { + delta <- h * (seq_along(alpha) == i) + (ising_conditional_log_likelihood.R(alpha + delta, X, Y) - + ising_conditional_log_likelihood.R(alpha - delta, X, Y)) / (2 * h) + }, seq_along(alpha)), nrow(alpha)) +} + +stopifnot(all.equal( + ising_conditional_log_likelihood.R(alpha, X, Y), + ising_conditional_log_likelihood(alpha, X, Y) +)) +microbenchmark::microbenchmark( + ising_conditional_log_likelihood.R(alpha, X, Y), + ising_conditional_log_likelihood(alpha, X, Y) +) + +stopifnot(all.equal( + ising_conditional_score.R(alpha, X, Y), + ising_conditional_score(alpha, X, Y) +)) +microbenchmark::microbenchmark( + ising_conditional_score.R(alpha, X, Y), + ising_conditional_score(alpha, X, Y) +) + +################################################################################ +### Fit Conditional Ising Model ### +################################################################################ + +ising_conditional_fit <- function(X, Y, ..., callback = NULL) { + # get and check dimensions + n <- if (is.null(nrow(Y))) length(Y) else nrow(Y) + p <- ncol(X) + q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y) + # check dimensions + stopifnot(nrow(X) == n) + + ### Initial value estimate + # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'` + SigmaSVD <- La.svd(cov(X), min(p, q), 0) + + # Estimate `pi` as the single and two way effect means (approx conditional + # probabilities through the marginal probability estimate) + pi <- mean.mvbinary(Y, twoway = TRUE) + + # convert conditional probabilities into natural parameters (log-odds) + theta <- ising_theta_from_cond_prob(pi) + + # convert natural parameters `theta` to square matrix form `Theta` + Theta <- matrix(NA, q, q) + Theta[lower.tri(diag(q), diag = TRUE)] <- theta + Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))] + Theta <- (0.5 + diag(0.5, q, q)) * Theta + + # SVD of `Theta` + ThetaSVD <- La.svd(Theta, min(p, q), 0) + + # Finally, initial `alpha` parameter estimate + # `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'` + alpha <- with(list(S = SigmaSVD, T = ThetaSVD), { + S$u %*% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %*% t(T$u) + }) + + ### Optimize log-likelihood for `alpha` + tensorPredictors::NAGD( + fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y), + fun.grad = function(alpha) -ising_conditional_score(alpha, 
X, Y), + params = alpha, + ..., + callback = callback + ) +} + +n <- 1000 +p <- 7 +q <- 9 + +alpha.true <- matrix(rnorm(p * q), p) +X <- matrix(rnorm(n * p), n) +theta <- function(alpha, x) { + Theta <- crossprod(crossprod(x, alpha)) + diag(Theta) <- 0.5 * diag(Theta) + 2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)] +} +# sample Y ~ P( . | X = x) for x in X +Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x))) +attr(Y, "p") <- as.integer(q) + +max.iter <- 100L +ising_conditional_fit(X, Y, max.iter = max.iter, callback = function(iter, alpha) { + cat(sprintf( + "%4d/%4d - diff: %12.4f - ll: %12.4f\n", + iter, max.iter, + min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")), + ising_conditional_log_likelihood(alpha, X, Y) + )) +}) + + +ising_conditional_log_likelihood(alpha.true, X, Y) +ising_conditional_log_likelihood.R(alpha.true, X, Y) + +for (. in 1:10) { + print(ising_conditional_log_likelihood(matrix(rnorm(p * q), p, q), X, Y)) +} + +YY <- as.mvbmatrix(Y) +microbenchmark::microbenchmark( + mean.mvbinary(Y, twoway = TRUE), + rowMeans(apply(YY, 1, function(y) outer(y, y, `&`)))[lower.tri(diag(q), diag = TRUE)] +) + +par(mfrow = c(2, 2)) +tensorPredictors::matrixImage(alpha) +tensorPredictors::matrixImage(alpha.true) +tensorPredictors::matrixImage(alpha) +tensorPredictors::matrixImage(-alpha.true) diff --git a/mvbernoulli/inst/examples/ising_sample.R b/mvbernoulli/inst/examples/ising_sample.R new file mode 100644 index 0000000..93a5ee7 --- /dev/null +++ b/mvbernoulli/inst/examples/ising_sample.R @@ -0,0 +1,90 @@ +# ilustration of the Ising model sampling routine + +sym <- function(A) A + t(A) + +vech <- function(A) A[lower.tri(A, diag = TRUE)] + +vech.inv <- function(a) { + # determin original matrix dimensions `p by p` + p <- as.integer((sqrt(8 * length(a) + 1) - 1) / 2) + # create matrix of dim `p by p` of the same data type as `a` + A <- matrix(do.call(typeof(a), list(1)), p, p) + # write elements of `a` into the correct positions of `A` + A[lower.tri(A, diag = TRUE)] <- a + A[upper.tri(A)] <- t(A)[upper.tri(A)] + A +} + +flip <- function(A) A[rev(seq_len(nrow(A))), rev(seq_len(ncol(A)))] + +# # R calculation of Theta +# Theta.R <- local({ +# P <- diag(cond_probs) +# PtP <- tcrossprod(P, P) +# Theta <- log(((1 - PtP) * cond_probs) / (PtP * (1 - cond_probs))) +# diag(Theta) <- log(P / (1 - P)) +# Theta +# }) + +# # ### MatLab computation of Theta +# # q = 7; +# # cond_probs = 0.75 .^ (1 + abs((1:q)' - (1:q))); +# # % compute natural parameter theta +# # P = cond_probs(sub2ind(size(cond_probs), 1:q, 1:q)); +# # PtP = P' * P; +# # % first: off diagonal elements +# # theta = log(((1 - PtP) .* cond_probs) ./ (PtP .* (1 - cond_probs))); +# # % second: diagonal elements +# # theta(sub2ind(size(theta), 1:q, 1:q)) = log(P ./ (1 - P)); + + +q <- 20 + +# conditional probabilities +cond_probs <- 0.75 ^ (1 + abs(outer(1:q, 1:q, `-`))) +cond_probs[tail(1:q, 3), ] <- 0.1 +cond_probs[, tail(1:q, 3)] <- 0.1 +cond_probs[tail(1:q, 3), tail(1:q, 3)] <- diag(0.4, 3, 3) + 0.1 + +theta <- ising_theta_from_cond_prob(flip(cond_probs)[lower.tri(cond_probs, diag = TRUE)]) + +system.time(Y <- ising_sample(10000, theta)) + +ising_expectation(theta) +stopifnot(all.equal(mean.mvbinary(Y), colMeans(as.mvbmatrix(Y)))) + +ising_cov(theta) +stopifnot(all.equal(cov.mvbinary(Y), cov(as.mvbmatrix(Y)))) + + +# covariances +cov.true <- ising_cov(theta) +cov.est <- cov.mvbinary(Y) + +# marginal probabilities +mar_probs.true <- flip(vech.inv(ising_marginal_probs(theta))) +mar_probs.est <- 
crossprod(as.mvbmatrix(Y)) / length(Y) + +par(mfrow = c(2, 3), mar = c(3, 2, 4, 1), oma = c(1, 0, 5, 0)) +tensorPredictors::matrixImage(cond_probs, + main = expression(pi == P[theta(pi)](paste(Y[i] == 1, ", ", Y[j] == 1, " | ", Y[paste(-i, ", ", -j)] == 0))) +) +tensorPredictors::matrixImage(cov.true, + main = expression(cov[theta(pi)](Y))) +tensorPredictors::matrixImage(cov.est, + main = expression(hat(cov)(Y)), + sub = paste("Est. Error:", round(norm(cov.true - cov.est, "F"), 3))) +tensorPredictors::matrixImage(flip(vech.inv(theta)), + main = expression({vech^-1}(theta(pi))) +) +tensorPredictors::matrixImage(mar_probs.true, + main = expression(P[theta(pi)](Y[i] == 1, Y[j] == 1))) +tensorPredictors::matrixImage(mar_probs.est, + main = expression(hat(P)(Y[i] == 1, Y[j] == 1)), + sub = paste("Est. Error:", round(norm(mar_probs.true - mar_probs.est, "F"), 3))) +mtext(bquote( + paste(Y, " ~ ", Ising[.(q)](theta(pi))) +), side = 3, line = 0, outer = TRUE) +mtext(bquote( + paste("Sample size ", n == .(length(Y))) +), side = 3, line = -1.5, outer = TRUE) diff --git a/mvbernoulli/inst/examples/ising_sim.R b/mvbernoulli/inst/examples/ising_sim.R new file mode 100644 index 0000000..7ae4a8d --- /dev/null +++ b/mvbernoulli/inst/examples/ising_sim.R @@ -0,0 +1,102 @@ + +ising_conditional_fit <- function(X, Y, ..., callback = NULL) { + # get and check dimensions + n <- if (is.null(nrow(Y))) length(Y) else nrow(Y) + p <- ncol(X) + q <- if (is.null(ncol(Y))) attr(Y, "p") else ncol(Y) + # check dimensions + stopifnot(nrow(X) == n) + + ### Initial value estimate + # SVD of the predictor covariance estimate `Sigma = U_Sigma D_Sigma U_Sigma'` + SigmaSVD <- La.svd(cov(X), min(p, q), 0) + + # Estimate `pi` as the single and two way effect means (approx conditional + # probabilities through the marginal probability estimate) + pi <- mean.mvbinary(Y, twoway = TRUE) + + # convert conditional probabilities into natural parameters (log-odds) + theta <- ising_theta_from_cond_prob(pi) + + # convert natural parameters `theta` to square matrix form `Theta` + Theta <- matrix(NA, q, q) + Theta[lower.tri(diag(q), diag = TRUE)] <- theta + Theta[upper.tri(diag(q))] <- t(Theta)[upper.tri(diag(q))] + Theta <- (0.5 + diag(0.5, q, q)) * Theta + + # SVD of `Theta` + ThetaSVD <- La.svd(Theta, min(p, q), 0) + + # Finally, initial `alpha` parameter estimate + # `alpha_0 = U_Sigma D_Sigma^-1/2 D_Theta^1/2 U_Theta'` + alpha <- with(list(S = SigmaSVD, T = ThetaSVD), { + S$u %*% diag(sqrt(T$d[seq_len(min(p, q))] / S$d[seq_len(min(p, q))])) %*% t(T$u) + }) + + ### Optimize log-likelihood for `alpha` + tensorPredictors::NAGD( + fun.loss = function(alpha) -ising_conditional_log_likelihood(alpha, X, Y), + fun.grad = function(alpha) -ising_conditional_score(alpha, X, Y), + params = alpha, + ..., + callback = callback + ) +} + +n <- 1000 +p <- 7 +q <- 9 + +alpha.true <- matrix(rnorm(p * q), p) +X <- matrix(runif(n * p), n) +theta <- function(alpha, x) { + Theta <- crossprod(crossprod(x, alpha)) + diag(Theta) <- 0.5 * diag(Theta) + 2 * Theta[lower.tri(diag(ncol(alpha)), diag = TRUE)] +} +# sample Y ~ P( . 
| X = x) for x in X +Y <- apply(X, 1, function(x) ising_sample(1, theta(alpha.true, x))) +attr(Y, "p") <- as.integer(q) + +alpha.est <- ising_conditional_fit(X, Y, + callback = function(iter, alpha) { + cat(sprintf( + "%4d - diff: %12.4f - ll: %12.4f\n", + iter, + min(norm(alpha - alpha.true, "F"), norm(alpha + alpha.true, "F")), + ising_conditional_log_likelihood(alpha, X, Y) + )) + }) + + +## +par(mfrow = c(3, 3), mar = c(2, 2, 1, 1)) +for (i in 1:9) { + tensorPredictors::matrixImage( + flip(vech.inv(theta(alpha.true, X[i, ]))), + main = paste(round(range(theta(alpha.true, X[i, ])), 3), collapse = " ") + ) +} + +par(mfrow = c(3, 3), mar = c(2, 2, 1, 1)) +for (i in 1:9) { + P <- flip(vech.inv(ising_cond_probs(theta(alpha.true, X[i, ])))) + tensorPredictors::matrixImage( + round(P, 5), P < .Machine$double.eps | 1 < P + .Machine$double.eps + ) +} + +par(mfrow = c(3, 3), mar = c(2, 2, 1, 1)) +for (i in 1:3) { + tensorPredictors::matrixImage( + alpha.true + ) + tensorPredictors::matrixImage( + flip(vech.inv(theta(alpha.true, X[i, ]))), + main = paste(round(range(theta(alpha.true, X[i, ])), 3), collapse = " ") + ) + P <- flip(vech.inv(ising_cond_probs(theta(alpha.true, X[i, ])))) + tensorPredictors::matrixImage( + round(P, 5), P < .Machine$double.eps | 1 < P + .Machine$double.eps + ) +} diff --git a/mvbernoulli/inst/include/mvbernoulli.h b/mvbernoulli/inst/include/mvbernoulli.h new file mode 100644 index 0000000..c013ff3 --- /dev/null +++ b/mvbernoulli/inst/include/mvbernoulli.h @@ -0,0 +1,86 @@ +// Included by Rcpp through naming convention into the generated RcppExports.cpp +// file. This anables to use custom Rcpp types throughout the package. + +#ifndef MVBERNOULLI_INCLUDE_GUARD_H +#define MVBERNOULLI_INCLUDE_GUARD_H + +#include +#include +#include + +#include "../../src/types.h" + +// Custom type consersion declarations +namespace Rcpp { + + // from R to C++ + template <> MVBinary as(SEXP); + // from C++ to R + template <> SEXP wrap(const MVBinary&); + +} /* namespace Rcpp */ + +#include + +// Custom type implementations +namespace Rcpp { + + // from R to C++ + template <> + MVBinary as(SEXP x) { + + if ((TYPEOF(x) == LGLSXP || TYPEOF(x) == INTSXP) && Rf_isMatrix(x)) { + int nrow = Rf_nrows(x); + int ncol = Rf_ncols(x); + + if (31 < ncol) { + Rcpp::stop("Event dimension too big, max is 31"); + } + + MVBinary binary(nrow, ncol); + + // convert logical/integer vector to numeric representation + int* data = (TYPEOF(x) == LGLSXP) ? 
LOGICAL(x) : INTEGER(x); + for (int i = 0; i < nrow; ++i) { + uint32_t num = 0; + for (int j = 0; j < ncol; ++j) { + num |= static_cast(data[i + nrow * j]) * (1 << j); + } + binary.push_back(num); + } + + return binary; + } else if ((TYPEOF(x) == INTSXP) && Rf_isVector(x)) { + int n = Rf_length(x); + SEXP pAttr = Rf_getAttrib(x, Rf_install("p")); + int p = -1; + if (TYPEOF(pAttr) == INTSXP) { + p = Rf_asInteger(pAttr); + } else if (TYPEOF(pAttr) == REALSXP) { + p = Rf_asInteger(pAttr); + } else { + Rcpp::stop("Unable to determin ncol (illegal `p` attribute)"); + } + if (p < 2 || 31 < p) { + Rcpp::stop("Unable to determin ncol (illegal `p` attribute)"); + } + + return MVBinary(INTEGER(x), INTEGER(x) + n, p); + } else { + Rcpp::stop("Unexpected dim/type"); + } + + } + + // from C++ to R + template <> + SEXP wrap(const MVBinary& binary) { + auto sexp = Rcpp::IntegerVector(binary.begin(), binary.end()); + sexp.attr("class") = Rcpp::CharacterVector::create("mvbinary"); + sexp.attr("p") = binary.dim(); + return sexp; + } + +} /* namespace Rcpp */ + +#endif /* MVBERNOULLI_INCLUDE_GUARD_H */ diff --git a/mvbernoulli/inst/include/threadPool.h b/mvbernoulli/inst/include/threadPool.h new file mode 100644 index 0000000..8389024 --- /dev/null +++ b/mvbernoulli/inst/include/threadPool.h @@ -0,0 +1,139 @@ +#ifndef THREADPOOL_INCLUDE_GUARD_H +#define THREADPOOL_INCLUDE_GUARD_H + +#include +#include +#include +#include +#include +#include +#include +#include + +// thread pool +class ThreadPool { +public: + ThreadPool(std::size_t n = std::thread::hardware_concurrency()) + : _shutdown{false}, _running{0} + { + // Reserve max nr. of worker space + _workers.reserve(n); + + // Launce `n` workers + for (std::size_t i = 0; i < n; ++i) { + _workers.emplace_back([this, i]() { + // setup infinit loop (continue untill told to shut down or + // everything is done) + for (;;) { + // setup a (skopped) lock to avoid inference + std::unique_lock lock(_mtx); + // wait untill ether shutdown or work is available + // (wait iff `!_shutdown && _jobs.empty()`) + _pager.wait(lock, [&]() { return _shutdown || !_jobs.empty(); }); + + // in case of shutdown terminate the infinit loop after all + // jobs have been processes + if (_shutdown && _jobs.empty()) { + break; // releases the lock + } + + // extract a job from the job queue + auto job = _jobs.front(); + _jobs.pop(); + + // increment the running jobs counter (before releasing the + // lock as the number of outstanding and running jobs needs + // to be precise) + _running += 1; + + // free the lock for other workers + lock.unlock(); + + // execute the job + job(); + + // decrement the running jobs counter + _running -= 1; + + // and report a done job (to everyone listening) + _callback.notify_all(); + } + }); + } + } + ~ThreadPool() { + { + // set shutdown with a lock to ensure that workers note the change + // in setting shutdown! + std::lock_guard lock(_mtx); + _shutdown = true; + } + + // notify all workers for the change + _pager.notify_all(); + + // finally join the worker threads into the main thread + for (auto& thr : _workers) { thr.join(); } + } + + // Add jobs to the job queue + template + void push(Fun&& job, Args&&... 
args) { + // add a new task to the job queue with a lock + { + std::unique_lock lock(_mtx); + _jobs.push([job, args...]() { job(args...); }); + } + + // notify one waiting worker that there is work to be done + _pager.notify_one(); + } + + // wait till all jobs have been processes + void wait() { + // infinit loop till all threads are idle + for (;;) { + // guard against job queue retriefel + std::unique_lock lock(_mtx); + + // wait for a callback (done job) to check again but only if there + // are any jobs to be performed + _callback.wait(lock, [&]() { return _jobs.empty() && !_running; }); + + if (_jobs.empty() && !_running) { + break; + } + + // lock released by end of skope + } + } + + // clears the job queue + void clear() { + // lock the queue + std::lock_guard lock(_mtx); + + // and swap the jobs queue with an empty queue + std::queue>().swap(_jobs); + } + + // get number of currently running jobs + std::size_t running_jobs() { return _running; } + + // get number of queued (waiting for execution) jobs + std::size_t queued_jobs() { return _jobs.size(); } + + // get number of worker threads + std::size_t workers() { return _workers.size(); } + +private: + bool _shutdown; + std::size_t _running; // number of running jobs + std::vector _workers; + std::queue> _jobs; + std::condition_variable _pager; // for wayking idle workers + std::condition_variable _callback; // for workers reporting a done job + std::mutex _mtx; // mutex base for cond. variables +}; + +#endif /* THREADPOOL_INCLUDE_GUARD_H */ diff --git a/mvbernoulli/src/Makevars b/mvbernoulli/src/Makevars new file mode 100644 index 0000000..b62029a --- /dev/null +++ b/mvbernoulli/src/Makevars @@ -0,0 +1 @@ +PKG_CXXFLAGS=-I../inst/include -pthread diff --git a/mvbernoulli/src/RcppExports.cpp b/mvbernoulli/src/RcppExports.cpp new file mode 100644 index 0000000..b08aa8b --- /dev/null +++ b/mvbernoulli/src/RcppExports.cpp @@ -0,0 +1,252 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include "../inst/include/mvbernoulli.h" +#include + +using namespace Rcpp; + +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + +// ising_log_odds_sum +double ising_log_odds_sum(uint32_t y, const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_log_odds_sum(SEXP ySEXP, SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< uint32_t >::type y(ySEXP); + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_log_odds_sum(y, theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_prob0 +double ising_prob0(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_prob0(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_prob0(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_probs +Rcpp::NumericVector ising_probs(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_probs(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_probs(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_cond_probs +Rcpp::NumericVector ising_cond_probs(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_cond_probs(SEXP 
thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_cond_probs(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_expectation +Rcpp::NumericVector ising_expectation(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_expectation(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_expectation(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_cov +Rcpp::NumericMatrix ising_cov(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_cov(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_cov(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_marginal_probs +Rcpp::NumericVector ising_marginal_probs(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_marginal_probs(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_marginal_probs(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_theta_from_cond_prob +Rcpp::NumericVector ising_theta_from_cond_prob(const VechView& pi); +RcppExport SEXP _mvbernoulli_ising_theta_from_cond_prob(SEXP piSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type pi(piSEXP); + rcpp_result_gen = Rcpp::wrap(ising_theta_from_cond_prob(pi)); + return rcpp_result_gen; +END_RCPP +} +// ising_log_likelihood +double ising_log_likelihood(const VechView& theta, const MVBinary& Y); +RcppExport SEXP _mvbernoulli_ising_log_likelihood(SEXP thetaSEXP, SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(ising_log_likelihood(theta, Y)); + return rcpp_result_gen; +END_RCPP +} +// ising_score +Rcpp::NumericVector ising_score(const VechView& theta, const MVBinary& Y); +RcppExport SEXP _mvbernoulli_ising_score(SEXP thetaSEXP, SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(ising_score(theta, Y)); + return rcpp_result_gen; +END_RCPP +} +// ising_conditional_log_likelihood +double ising_conditional_log_likelihood(const Rcpp::NumericMatrix& alpha, const Rcpp::NumericMatrix& X, const MVBinary& Y); +RcppExport SEXP _mvbernoulli_ising_conditional_log_likelihood(SEXP alphaSEXP, SEXP XSEXP, SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type X(XSEXP); + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(ising_conditional_log_likelihood(alpha, X, Y)); + return rcpp_result_gen; +END_RCPP +} +// ising_conditional_score +Rcpp::NumericVector ising_conditional_score(const Rcpp::NumericMatrix& alpha, const Rcpp::NumericMatrix& X, const MVBinary& Y); +RcppExport SEXP _mvbernoulli_ising_conditional_score(SEXP alphaSEXP, SEXP XSEXP, SEXP YSEXP) { +BEGIN_RCPP + 
Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type X(XSEXP); + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(ising_conditional_score(alpha, X, Y)); + return rcpp_result_gen; +END_RCPP +} +// ising_fisher_info +Rcpp::NumericMatrix ising_fisher_info(const VechView& theta); +RcppExport SEXP _mvbernoulli_ising_fisher_info(SEXP thetaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + rcpp_result_gen = Rcpp::wrap(ising_fisher_info(theta)); + return rcpp_result_gen; +END_RCPP +} +// ising_sample +MVBinary ising_sample(const std::size_t n, const VechView& theta, const std::size_t warmup); +RcppExport SEXP _mvbernoulli_ising_sample(SEXP nSEXP, SEXP thetaSEXP, SEXP warmupSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::size_t >::type n(nSEXP); + Rcpp::traits::input_parameter< const VechView& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const std::size_t >::type warmup(warmupSEXP); + rcpp_result_gen = Rcpp::wrap(ising_sample(n, theta, warmup)); + return rcpp_result_gen; +END_RCPP +} +// print_mvbinary +void print_mvbinary(const MVBinary& binary, int nrLines); +RcppExport SEXP _mvbernoulli_print_mvbinary(SEXP binarySEXP, SEXP nrLinesSEXP) { +BEGIN_RCPP + Rcpp::traits::input_parameter< const MVBinary& >::type binary(binarySEXP); + Rcpp::traits::input_parameter< int >::type nrLines(nrLinesSEXP); + print_mvbinary(binary, nrLines); + return R_NilValue; +END_RCPP +} +// printBits +void printBits(const Rcpp::IntegerVector& ints); +RcppExport SEXP _mvbernoulli_printBits(SEXP intsSEXP) { +BEGIN_RCPP + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type ints(intsSEXP); + printBits(ints); + return R_NilValue; +END_RCPP +} +// as_mvbinary +MVBinary as_mvbinary(const MVBinary& Y); +RcppExport SEXP _mvbernoulli_as_mvbinary(SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(as_mvbinary(Y)); + return rcpp_result_gen; +END_RCPP +} +// as_mvbmatrix +Rcpp::LogicalMatrix as_mvbmatrix(const MVBinary& Y); +RcppExport SEXP _mvbernoulli_as_mvbmatrix(SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(as_mvbmatrix(Y)); + return rcpp_result_gen; +END_RCPP +} +// mean_mvbinary +Rcpp::NumericVector mean_mvbinary(const MVBinary& Y, const bool twoway); +RcppExport SEXP _mvbernoulli_mean_mvbinary(SEXP YSEXP, SEXP twowaySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + Rcpp::traits::input_parameter< const bool >::type twoway(twowaySEXP); + rcpp_result_gen = Rcpp::wrap(mean_mvbinary(Y, twoway)); + return rcpp_result_gen; +END_RCPP +} +// cov_mvbinary +Rcpp::NumericMatrix cov_mvbinary(const MVBinary& Y); +RcppExport SEXP _mvbernoulli_cov_mvbinary(SEXP YSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< const MVBinary& >::type Y(YSEXP); + rcpp_result_gen = Rcpp::wrap(cov_mvbinary(Y)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_mvbernoulli_ising_log_odds_sum", (DL_FUNC) &_mvbernoulli_ising_log_odds_sum, 2}, 
+ {"_mvbernoulli_ising_prob0", (DL_FUNC) &_mvbernoulli_ising_prob0, 1}, + {"_mvbernoulli_ising_probs", (DL_FUNC) &_mvbernoulli_ising_probs, 1}, + {"_mvbernoulli_ising_cond_probs", (DL_FUNC) &_mvbernoulli_ising_cond_probs, 1}, + {"_mvbernoulli_ising_expectation", (DL_FUNC) &_mvbernoulli_ising_expectation, 1}, + {"_mvbernoulli_ising_cov", (DL_FUNC) &_mvbernoulli_ising_cov, 1}, + {"_mvbernoulli_ising_marginal_probs", (DL_FUNC) &_mvbernoulli_ising_marginal_probs, 1}, + {"_mvbernoulli_ising_theta_from_cond_prob", (DL_FUNC) &_mvbernoulli_ising_theta_from_cond_prob, 1}, + {"_mvbernoulli_ising_log_likelihood", (DL_FUNC) &_mvbernoulli_ising_log_likelihood, 2}, + {"_mvbernoulli_ising_score", (DL_FUNC) &_mvbernoulli_ising_score, 2}, + {"_mvbernoulli_ising_conditional_log_likelihood", (DL_FUNC) &_mvbernoulli_ising_conditional_log_likelihood, 3}, + {"_mvbernoulli_ising_conditional_score", (DL_FUNC) &_mvbernoulli_ising_conditional_score, 3}, + {"_mvbernoulli_ising_fisher_info", (DL_FUNC) &_mvbernoulli_ising_fisher_info, 1}, + {"_mvbernoulli_ising_sample", (DL_FUNC) &_mvbernoulli_ising_sample, 3}, + {"_mvbernoulli_print_mvbinary", (DL_FUNC) &_mvbernoulli_print_mvbinary, 2}, + {"_mvbernoulli_printBits", (DL_FUNC) &_mvbernoulli_printBits, 1}, + {"_mvbernoulli_as_mvbinary", (DL_FUNC) &_mvbernoulli_as_mvbinary, 1}, + {"_mvbernoulli_as_mvbmatrix", (DL_FUNC) &_mvbernoulli_as_mvbmatrix, 1}, + {"_mvbernoulli_mean_mvbinary", (DL_FUNC) &_mvbernoulli_mean_mvbinary, 2}, + {"_mvbernoulli_cov_mvbinary", (DL_FUNC) &_mvbernoulli_cov_mvbinary, 1}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_mvbernoulli(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/mvbernoulli/src/bit_utils.cpp b/mvbernoulli/src/bit_utils.cpp new file mode 100644 index 0000000..0f299e5 --- /dev/null +++ b/mvbernoulli/src/bit_utils.cpp @@ -0,0 +1,82 @@ +#include "bit_utils.h" + +#if defined(__GNUC__) && defined(__BMI2__) + #include + #include // _pdep_u32 +#endif + + +int bitParity(uint32_t x) { +#ifdef __GNUC__ + return __builtin_parity(x); +#else + bool p = static_cast(x); + while (x &= x - 1) { + p = !p; + } + return static_cast(p); +#endif +} + +int bitCount(uint32_t x) { +#ifdef __GNUC__ + return __builtin_popcount(x); // `POPulation COUNT` +#else + int count = 0; // counts set bits + + // increment count until there are no bits set in x + for (; x; count++) { + x &= x - 1; // unset least significant bit + } + + return count; +#endif +} + +int bitScanLS(uint32_t x) { +#ifdef __GNUC__ + return __builtin_ctz(x); // Count Trailing Zeros +#else + // result storing the Count of Trailing Zeros + int ctz = 0; + + // boolean variable storing if a bit has not found (search area is empty) + bool empty; + + // logarithmic search for LSB bit index (-1) + ctz += (empty = !(x & static_cast(65535))) << 4; + x >>= 16 * empty; + ctz += (empty = !(x & static_cast( 255))) << 3; + x >>= 8 * empty; + ctz += (empty = !(x & static_cast( 15))) << 2; + x >>= 4 * empty; + ctz += (empty = !(x & static_cast( 3))) << 1; + x >>= 2 * empty; + ctz += (empty = !(x & static_cast( 1))); + + return ctz; +#endif +} + +uint32_t bitDeposit(uint32_t val, uint32_t mask) { +#if (defined(__GNUC__) && defined(__BMI2__)) + return _pdep_u32(val, mask); +#else + uint32_t res = 0; + for (uint32_t pos = 1; mask; pos <<= 1) { + if (val & pos) { + res |= mask & -mask; + } + mask &= mask - 1; + } + return res; +#endif +} + +uint32_t bitNextPerm(uint32_t val) { + // Sets all least significant 0-bits of val to 1 + uint32_t 
t = val | (val - 1); + // Next set to 1 the most significant bit to change, + // set to 0 the least significant ones, and add the necessary 1 bits. + return (t + 1) | (((~t & -~t) - 1) >> (bitScanLS(val) + 1)); +} diff --git a/mvbernoulli/src/bit_utils.h b/mvbernoulli/src/bit_utils.h new file mode 100644 index 0000000..85040f7 --- /dev/null +++ b/mvbernoulli/src/bit_utils.h @@ -0,0 +1,56 @@ +#ifndef INCLUDE_GUARD_BIT_UTILS_H +#define INCLUDE_GUARD_BIT_UTILS_H + +#include // uint32_t, uint64_t + +/** + * Computes the parity of a 32-bit word (0 for even bit count and 1 otherwise) + */ +int bitParity(uint32_t x); + +/** + * Counts the number of set bits (`1`s in binary) in the number `x` + */ +int bitCount(uint32_t x); + +/** + * Gets the index of the LSB (least significant bit) in a 32-bit word + * + * @condition `x != 0`, for `x == 0` undefined behaviour + */ +int bitScanLS(uint32_t x); + +/** + * 32-bit Parallel DEPosit (aka PDEP) + * + * Writes the `val` bits into the positions of the set bits in `mask`. + * + * Example: + * val: **** **** **** 1.1. + * mask: 1... 1... 1... 1... + * res: 1... .... 1... .... + */ +uint32_t bitDeposit(uint32_t val, uint32_t mask); + +/** + * Gets the next lexicographically ordered permutation of an n-bit word. + * + * Let `val` be a bit-word with `n` bits set, then this procedire computes a + * `n` bit word wich is the next element in the lexicographically ordered + * sequence of `n` bit words. For example + * + * val -> bitNextPerm(val) + * 00010011 -> 00010101 + * 00010101 -> 00010110 + * 00010110 -> 00011001 + * 00011001 -> 00011010 + * 00011010 -> 00011100 + * 00011100 -> 00100011 + * + * @condition `x != 0`, for `x == 0` undefined behaviour due to `bitScanLS` + * + * see: https://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation + */ +uint32_t bitNextPerm(uint32_t val); + +#endif /* BIT_UTILS_INCLUDE_GUARD_H */ diff --git a/mvbernoulli/src/int_utils.cpp b/mvbernoulli/src/int_utils.cpp new file mode 100644 index 0000000..cb8b55a --- /dev/null +++ b/mvbernoulli/src/int_utils.cpp @@ -0,0 +1,51 @@ +#include "int_utils.h" + +#if (defined(__GNUC__) && defined(__BMI2__)) + #include + #include // _pdep_u32 +#endif + +int ilog2(uint64_t x) { + int log = 0; + while (x >>= 1) { + log++; + } + return log; +} + +uint64_t isqrt(uint64_t x) { + // implements a binary search + uint64_t root = 0; + uint64_t left = 0; // left boundary + uint64_t right = x + 1; // right boundary + + while(left + 1UL < right) { + root = (left + right) / 2UL; + if (root * root <= x) { + left = root; + } else { + right = root; + } + } + return left; +} + +uint32_t invTriag(uint32_t x) { + uint64_t root = isqrt(8UL * static_cast(x) + 1UL); + if (root * root != 8UL * x + 1UL) { + return 0; + } + return (root - 1) / 2; +} + +uint64_t nrSubSets(uint64_t n, uint64_t k) { + uint64_t sum = 1, binom = 1; + + for (uint64_t i = 1; i <= k; ++i) { + binom *= n--; + binom /= i; + sum += binom; + } + + return sum; +} diff --git a/mvbernoulli/src/int_utils.h b/mvbernoulli/src/int_utils.h new file mode 100644 index 0000000..4099dd5 --- /dev/null +++ b/mvbernoulli/src/int_utils.h @@ -0,0 +1,35 @@ +#ifndef INCLUDE_GUARD_INT_UTILS_H +#define INCLUDE_GUARD_INT_UTILS_H + +#include // uint32_t, uint64_t + +/** + * Integer logarithm, the biggest power `p` such that `2^p <= x`. 
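+ *
+ * Illustrative values (added for clarity, derived from the shift-loop
+ * implementation above, not part of the original docs):
+ *
+ *     ilog2(1) == 0,  ilog2(2) == 1,  ilog2(8) == 3,  ilog2(9) == 3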
+ */ +int ilog2(uint64_t x); + +/** + * Integer Square root of `y`, that is `ceiling(sqrt(y))` + */ +uint64_t isqrt(uint64_t x); + +/** + * Inverse to the triangular numbers + * + * Given a positive number `x = p (p + 1) / 2` it computes `p` if possible. + * In case there is no positive integer solution `0` is returned. + * + * Note: this follows immediately from the quadratic equation. + */ +uint32_t invTriag(uint32_t x); + +/** + * Number of sub-sets (including empty set) of max-size + * + * It computes, with `binom` beeing the binomial coefficient, the following sum + * + * sum_{i = 0}^k binom(n, i) + */ +uint64_t nrSubSets(uint64_t n, uint64_t k); + +#endif /* INCLUDE_GUARD_INT_UTILS_H */ diff --git a/mvbernoulli/src/ising_model.cpp b/mvbernoulli/src/ising_model.cpp new file mode 100644 index 0000000..38f83e5 --- /dev/null +++ b/mvbernoulli/src/ising_model.cpp @@ -0,0 +1,812 @@ +#include // R to C++ binding library +#include // `std::exp` +#include +#include +#include +#include +#include // mainly for debugging reasons + +#include "types.h" // MVBinary (Multivariate Binary Data) +#include "bit_utils.h" // uint32_t, ... and the `bit*` functions +#include "int_utils.h" // isqrt, ilog, ... + +#include "threadPool.h" + +/******************************************************************************/ +/*** Ising model ***/ +/******************************************************************************/ +/** + * The Ising model (as a special case of the Multi-Variate Bernoulli) has its + * probability mass function (pmf) for a `p` dim. binary vector `y` defined as + * + * P(Y = y | theta) = p_0(theta) exp(T(y)' theta) + * + * with the parameter vector `theta` and a statistic `T` of `y`. The real valued + * parameter vector `theta` is of dimension `p (p + 1) / 2` and the statistic + * `T` has the same dimensions as a binary vector given by + * + * T(y) = vech(y y'). + * + * TODO: continue + */ + + +// [[Rcpp::export(rng = false)]] +double ising_log_odds_sum(uint32_t y, const VechView& theta) { + // Collects the result `T(theta)' y` (basically the sum of the log odds in + // the parameter vector `theta` with a single or two way interaction in `y` + double log_odds = 0.0; + + // Iterate over all bits in the event `y` + for (; y; y &= y - 1) { + // get LSB index + int i = bitScanLS(y); + // the single effect index in the parameter vector `theta` + int base_index = (i * (2 * theta.dim() + 1 - i)) / 2; + // add single effect log odds + log_odds += theta[base_index]; + // For all (remaining) other effects add the two way interaction odds + for (uint32_t b = y & (y - 1); b; b &= b - 1) { + log_odds += theta[base_index + bitScanLS(b) - i]; + } + + } + + return log_odds; +} + +//' Ising model scaling factor `p_0(theta)` for the Ising model +//' +// [[Rcpp::export(rng = false)]] +double ising_prob0(const VechView& theta) { + // Value of the event `(1, ..., 1)` + const uint32_t max_event = (static_cast(1) << theta.dim()) - 1; + // Accumulates `p_0(theta)^-1` + double sum_0 = 1.0; + + // sum up all event (except `(0, .., 0)`, considured as initial value `1`) + for (uint32_t a = 1; a <= max_event; ++a) { + sum_0 += exp(ising_log_odds_sum(a, theta)); + } + + // scaling factor `p_0(theta) = sum_0^-1` + return 1.0 / sum_0; +} + +//' Ising model probabilities for every event, this returns a vector of size `2^p` +//' with indices corresponding to the events binary representation. +//' +//' Note: the R indexing leads to adding +1 to every index. 
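+//'
+//' A minimal usage sketch (added for illustration, not part of the original
+//' docs); it relies only on the returned vector being a full probability
+//' mass function over all `2^p` events:
+//'
+//'     theta <- rnorm(3 * (3 + 1) / 2)   # p = 3, so p * (p + 1) / 2 = 6 parameters
+//'     probs <- ising_probs(theta)
+//'     length(probs)                     # 2^3 = 8, one entry per event
+//'     stopifnot(all.equal(sum(probs), 1))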
+//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericVector ising_probs(const VechView& theta) { + // setup probability vector + Rcpp::NumericVector probs(1 << theta.dim()); + + // Value of the event `(1, ..., 1)` + const uint32_t max_event = (static_cast(1) << theta.dim()) - 1; + // Accumulates `p_0(theta)^-1` + double sum_0 = 1.0; + + // set prob for the zero event to `1`, scaled later with all the other events + probs[0] = 1.0; + + // sum up all event (except `(0, .., 0)`, considured as initial value `1`) + for (uint32_t a = 1; a <= max_event; ++a) { + // set and accumulate (unscaled) probabilites + sum_0 += (probs[a] = exp(ising_log_odds_sum(a, theta))); + } + + // finish scaling factor calculation `p_0(theta) = 1 / sum_0` + double prob_0 = 1.0 / sum_0; + + // scale probabilites + for (auto& prob : probs) { + prob *= prob_0; + } + + return probs; +} + +//' Computes the zero conditioned probabilities +//' P(Y_i = 1 | Y_-i = 0) +//' and +//' P(Y_i = 1, Y_j = 1 | Y_-i = 0) +//' from the natural parameters `theta` with matching indexing. +//' +//' This is the inverse function of `ising_theta_from_cond_prob` +//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericVector ising_cond_probs(const VechView& theta) { + // setup result of the same size as theta + Rcpp::NumericVector pi(theta.size()); + + // set random variable dimension + std::size_t p = theta.dim(); + + // iterate all single effects + for (std::size_t i = 0; i < p; ++i) { + // compute probs for single effects + std::size_t base_index = (i * (2 * p + 1 - i)) / 2; + double exp_i = exp(theta[base_index]); + // the probability `P(Y_i = 1 | Y_-i = 0)` + pi[base_index] = exp_i / (1 + exp_i); + // iterate over bigger indexed components interacting with the `i`th one + for (std::size_t j = i + 1; j < p; ++j) { + // again compute exp(theta_j) + double exp_j = exp(theta[(j * (2 * p + 1 - j)) / 2]); + // as well as the two way exponent + double exp_ij = exp(theta[base_index + (j - i)]); + // two way consitional probability `P(Y_i = Y_j = 1 | Y_-i,-j = 0)` + pi[base_index + (j - i)] = (exp_i * exp_j * exp_ij) + / (1.0 + exp_i + exp_j + exp_i * exp_j * exp_ij); + } + } + + return pi; +} + +//' Computes the expectation of `Y` under the Ising model with natural parameter +//' `theta` given component wise by +//' +//' E Y_i = P(Y_i = 1) +//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericVector ising_expectation(const VechView& theta) { + const std::size_t p = theta.dim(); // dim of `Y` + const uint32_t max_event = static_cast(-1) >> (32 - p); + double sum_0 = 1.0; // accumulates `p_0(theta)^-1` + Rcpp::NumericVector mu(p, 0.0); // `mu = E Y` + + // iterate all 2^p events (except the zero event) + for (uint32_t y = 1; y <= max_event; ++y) { + // dot product `vech(y y')' theta` for current event + double dot = 0; + // iterate all bits in `y` + for (uint32_t a = y; a; a &= a - 1) { + int i = bitScanLS(a); + int base_index = (i * (2 * p + 1 - i)) / 2; + // add single effects of `y` + dot += theta[base_index]; + // iterate over all (higher indexed) interactions and add then + for (uint32_t b = a & (a - 1); b; b &= b - 1) { + dot += theta[base_index + bitScanLS(b) - i]; + } + } + // compute (unscaled) event `y` probability `exp(T(y)' theta)` + double prob_y = exp(dot); + sum_0 += prob_y; + + // add current (unscalled) probability to all components of set bits + for (uint32_t a = y; a; a &= a - 1) { + mu[bitScanLS(a)] += prob_y; + } + } + + // finalize `E[Y]` by scaling with `p_0 = sum_0^-1` + double prob_0 = 1.0 / sum_0; + for (auto& mui : mu) { + mui 
*= prob_0; + } + + return mu; +} + +//' Computes the covariance (second centered moment) of `Y` under the Ising model +//' with natural parameter `theta`. +//' +//' cov(Y, Y) = E[Y Y'] - E[Y] E[Y]' +//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericMatrix ising_cov(const VechView& theta) { + const std::size_t p = theta.dim(); // dim of `Y` + const uint32_t max_event = static_cast(-1) >> (32 - p); + double sum_0 = 1.0; // accumulates `p_0(theta)^-1` + Rcpp::NumericMatrix cov(p, p); + + // iterate over all 2^p events (except the zero event) + for (std::size_t y = 1; y <= max_event; ++y) { + // dot product `vech(y y')' theta` for current event + double dot = 0; + // iterate all bits in `y` + for (uint32_t a = y; a; a &= a - 1) { + int i = bitScanLS(a); + int base_index = (i * (2 * p + 1 - i)) / 2; + // add single effects of `y` + dot += theta[base_index]; + // iterate over all (higher indexed) interactions and add then + for (uint32_t b = a & (a - 1); b; b &= b - 1) { + dot += theta[base_index + bitScanLS(b) - i]; + } + } + // compute (unscaled) event `y` probability `exp(T(y)' theta)` + double prob_y = exp(dot); + sum_0 += prob_y; + + // sum E[Y, Y'] but still unscaled + for (uint32_t a = y; a; a &= a - 1) { + int i = bitScanLS(a); + for (uint32_t b = y; b; b &= b - 1) { + int j = bitScanLS(b); + cov[i * p + j] += prob_y; + } + } + } + + // finish computing `E[Y Y']` by scaling with `p_0 = sum_0^-1` + double prob_0 = 1.0 / sum_0; + for (auto& covij : cov) { + covij *= prob_0; + } + + // subtract outer product of expectation `-= E[Y] E[Y]'` + const auto mu = ising_expectation(theta); // `mu = E[Y]` + for (std::size_t j = 0; j < p; ++j) { + for (std::size_t i = 0; i < p; ++i) { + cov[j * p + i] -= mu[i] * mu[j]; + } + } + + return cov; +} + +//' Computes the single and two way effect marginal probabilities +//' +//' P(Y_i = 1) +//' and +//' P(Y_i Y_j = 1) +//' +//' In its vectorized form this function computes E[vech(Y Y')] +//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericVector ising_marginal_probs(const VechView& theta) { + // Step 0: Setup (and validate) variables/parameters + const std::size_t p = theta.dim(); + if (p != theta.dim()) { + Rcpp::stop("Parameter dimension does not match data dimension"); + } + const uint32_t max_event = static_cast(-1) >> (32 - p); + double sum_0 = 1.0; // accumulates p_0(theta)^-1 + Rcpp::NumericVector score(theta.size(), 0.0); // grad l(theta) + + // Step 1: Calc `-n E[vech(Y Y')]` where the sum over `y` is the sum over + // all possible events `y in {0, 1}^p`. 
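+    //
+    // Note on the indexing used below (added for clarity): `theta` and the
+    // result are stored in column-major lower-triangle (vech) order, so the
+    // diagonal entry (i, i) sits at `(i * (2 * p + 1 - i)) / 2` and the
+    // interaction (j, i) with j > i sits `j - i` positions after it. For
+    // p = 3 the vech positions are (0,0) -> 0, (1,0) -> 1, (2,0) -> 2,
+    // (1,1) -> 3, (2,1) -> 4, (2,2) -> 5.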
+    for (uint32_t y = 1; y <= max_event; ++y) {
+        // sum of `T(y)' theta` for current instance `y`
+        double log_odds = 0;
+        // iterate all bits in `y`
+        for (uint32_t a = y; a; a &= a - 1) {
+            int i = bitScanLS(a);
+            int base_index = (i * (2 * p + 1 - i)) / 2;
+            // add single effects of `y`
+            log_odds += theta[base_index];
+            // iterate over all (higher indexed) interactions and add them
+            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
+                log_odds += theta[base_index + bitScanLS(b) - i];
+            }
+        }
+        // compute (unscaled) event `y` probability `exp(T(y)' theta)`
+        double prob_y = exp(log_odds);
+        sum_0 += prob_y;
+
+        // at this point we know the (unscaled) probability of the event `y`
+        // which needs to be added to the gradient at the set bit positions
+        // of `y` corresponding to single and interaction effects
+        for (uint32_t a = y; a; a &= a - 1) {
+            int i = bitScanLS(a);
+            int base_index = (i * (2 * p + 1 - i)) / 2;
+            score[base_index] += prob_y;
+            for (uint32_t b = a & (a - 1); b; b &= b - 1) {
+                score[base_index + bitScanLS(b) - i] += prob_y;
+            }
+        }
+    }
+    // finalize `E[vech(Y Y')]` by scaling with `p_0 = sum_0^-1`
+    double prob_0 = 1.0 / sum_0;
+    for (auto& s : score) {
+        s *= prob_0;
+    }
+
+    return score;
+}
+
+//' Natural parameters from the sufficient conditional probability statistics `pi`
+//'
+//' Computes the natural parameters `theta` of the Ising model from zero
+//' conditioned probabilities for single and two way effects.
+//'
+//' This is the inverse function of `ising_cond_prob_from_theta`
+//'
+// [[Rcpp::export(rng = false)]]
+Rcpp::NumericVector ising_theta_from_cond_prob(const VechView& pi) {
+    // initialize natural parameters vector theta
+    Rcpp::NumericVector theta(pi.size());
+
+    // check if the given probabilities are in the range [0, 1]
+    if (std::any_of(pi.begin(), pi.end(), [](const double prob) {
+        return (prob < 0.0) || (1.0 < prob);
+    })) {
+        Rcpp::stop("`pi` must contain only elements in the range [0, 1]");
+    }
+
+    // get random variable dimension
+    std::size_t p = pi.dim();
+
+    // iterate all single effects
+    for (std::size_t i = 0; i < p; ++i) {
+        // compute single effect theta_i
+        std::size_t base_index = (i * (2 * p + 1 - i)) / 2;
+        theta[base_index] = log(pi[base_index] / (1.0 - pi[base_index]));
+        // iterate all higher indexed interactions with the current effect
+        for (std::size_t j = i + 1; j < p; ++j) {
+            theta[base_index + (j - i)] = log(
+                ((1 - pi(i) * pi(j)) * pi(i, j)) / (pi(i) * pi(j) * (1 - pi(i, j)))
+            );
+        }
+    }
+
+    return theta;
+}
+
+//' Computes the log-likelihood at natural parameters `theta` of the Ising model
+//' given a set of observations `Y`
+//'
+//' l(theta) = log(p_0(theta)) + n^-1 sum_i vech(y_i y_i')' theta
+//'
+// [[Rcpp::export(rng = false)]]
+double ising_log_likelihood(const VechView& theta, const MVBinary& Y) {
+    // sum the log odds `sum_i vech(y_i y_i')' theta`
+    double sum = 0.0;
+    for (const uint32_t y : Y) {
+        sum += ising_log_odds_sum(y, theta);
+    }
+
+    // add scaling factor `log(p_0(theta))`
+    return log(ising_prob0(theta)) + (sum / static_cast<double>(Y.size()));
+}
+
+//' Computes the score of the Ising model, that is the gradient of the (mean)
+//' log-likelihood with respect to the natural parameters
+//'
+//' grad l(theta) = -E[vech(Y Y')] + n^-1 sum_i vech(y_i y_i')
+//'
+// [[Rcpp::export(rng = false)]]
+Rcpp::NumericVector ising_score(const VechView& theta, const MVBinary& Y) {
+    const std::size_t p = theta.dim();
+
+    // Step 1: compute -E[vech(Y Y')] (data independent part)
+    auto score = ising_marginal_probs(theta);
+ for (auto& s : score) { + s *= -1.0; + } + + // Step 2: Add data dependend part `mean_i vech(y_i y_i')` + const double n_inv = 1.0 / static_cast(Y.size()); + for (const uint32_t y : Y) { + // start by iterating the single effects in `y` (LSB) + for (uint32_t a = y; a; a &= a - 1) { + int i = bitScanLS(a); + int base_index = (i * (2 * p + 1 - i)) / 2; + // add single effects + score[base_index] += n_inv; + // and the two way interaction effects + for (uint32_t b = a & (a - 1); b; b &= b - 1) { + score[base_index + bitScanLS(b) - i] += n_inv; + } + } + } + + return score; +} + +/** + * Overload of `ising_score` for a single observation `y` + */ +Rcpp::NumericVector ising_score(const VechView& theta, + const uint32_t y, const std::size_t p +) { + // Step 1: compute -E[vech(Y Y')] (data independent part) + auto score = ising_marginal_probs(theta); + for (auto& s : score) { + s *= -1.0; + } + + // Step 2: Add data dependend part `vech(y y')` + // start by iterating the single effects in `y` (LSB) + for (uint32_t a = y; a; a &= a - 1) { + int i = bitScanLS(a); + int base_index = (i * (2 * p + 1 - i)) / 2; + // add single effects + score[base_index] += 1.0; + // and the two way interaction effects + for (uint32_t b = a & (a - 1); b; b &= b - 1) { + score[base_index + bitScanLS(b) - i] += 1.0; + } + } + + return score; +} + +/** + * Computes the log-likelihood at natural parameters `theta` of the Ising model + * given a set of observations `Y` + * + * l(alpha) = n^-1 sum_i (log(p_0(alpha, x_i)) + (x_i' alpha y_i)^2) + * = n^-1 sum_i (log(p_0(theta_alpha(x_i))) + vech(y_i y_i')'theta_alpha(x_i)) + */ +// [[Rcpp::export(rng = false)]] +double ising_conditional_log_likelihood(const Rcpp::NumericMatrix& alpha, + const Rcpp::NumericMatrix& X, const MVBinary& Y +) { + // get probem dimensions + const std::size_t p = alpha.nrow(); + const std::size_t q = alpha.ncol(); + + // check parameter dimensions + if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) { + Rcpp::stop("Parameter dimension missmatch"); + } + + // natural parameter for the conditional Ising model + // `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))` + // with `o` denoting the Hadamart product. + VechView theta(q); + // temp inbetween variables + Rcpp::NumericVector z(q); + + // sum over all observations + double ll = 0.0; + for (std::size_t i = 0; i < Y.size(); ++i) { + // get i'th observation (response `y` and predictor `x`) + const uint32_t y = Y[i]; + const auto x = X.row(i); + + // compute natural parameter from `alpha` with current predictor `x` + // First `z = alpha' x_i` and `s = y_i' alpha' x_i` + double s = 0.0; + for (std::size_t j = 0; j < q; ++j) { + z[j] = 0.0; + for (std::size_t k = 0; k < p; ++k) { + z[j] += x[k] * alpha[j * p + k]; + } + s += static_cast(y & (1 << j)) * z[j]; + } + // and then `vech` of the outer product `z z'` with off diagonal + // elements multiplied by `2`. (See: `Theta` to `theta` relation) + // Explicitly (`theta = theta_alpha(x)`): + // `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))` + // where `o` is the Hadamard pdoruct. 
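+        //
+        // Small worked example (illustrative only): for q = 2 and
+        // z = (z_1, z_2)' the loop below fills theta in vech order as
+        //     theta = (z_1^2, 2 z_1 z_2, z_2^2)'
+        // so that vech(y y')' theta equals y' Theta_alpha(x) y with
+        // Theta_alpha(x) = z z' = alpha' x x' alpha.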
+ for (std::size_t j = 0; j < q; ++j) { + theta(j, j) = z[j] * z[j]; + for (std::size_t k = j + 1; k < q; ++k) { + theta(j, k) = 2.0 * z[j] * z[k]; + } + } + + // add to log-lilelihood sum + ll += log(ising_prob0(theta)) + s * s; + } + + return ll / static_cast(Y.size()); +} + +/** + * Comutes the Score for the conditional Ising model + * + * P(Y = y | X = x) = p_0(alpha) exp(y' Theta_alpha(x) y) + * + * with the parameter relation + * + * Theta_alpha(x) = alpha' x x' alpha. + * + * The computed Score has the form + * + * grad l(alpha) = + * 2 n^-1 sum_i x_i x_i' alpha (-E[vec(Y Y') | X = x_i] + vec(y_i y_i')) + */ +// [[Rcpp::export(rng = false)]] +Rcpp::NumericVector ising_conditional_score(const Rcpp::NumericMatrix& alpha, + const Rcpp::NumericMatrix& X, const MVBinary& Y +) { + // get probem dimensions + const std::size_t p = alpha.nrow(); + const std::size_t q = alpha.ncol(); + + // check parameter dimensions + if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) { + Rcpp::stop("Parameter dimension missmatch"); + } + + // setup the Score (default zero initialized) + Rcpp::NumericMatrix score(p, q); + // natural parameter for the conditional Ising model + // `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))` + // with `o` denoting the Hadamart product. + VechView theta(q); + // temp inbetween variables + Rcpp::NumericVector z(q); + + // for each observation (iter observation index i = 0, ..., n - 1) + for (std::size_t i = 0; i < Y.size(); ++i) { + // get i'th observation (response `y` and predictor `x`) + const uint32_t y = Y[i]; + const auto x = X.row(i); + + // compute natural parameter from `alpha` with current predictor `x` + // First `z = alpha' x` + for (std::size_t j = 0; j < q; ++j) { + z[j] = 0.0; + for (std::size_t k = 0; k < p; ++k) { + z[j] += x[k] * alpha[j * p + k]; + } + } + // and then `vech` of the outer product `z z'` with off diagonal + // elements multiplied by `2`. (See: `Theta` to `theta` relation) + // Explicitly (`theta = theta_alpha(x)`): + // `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))` + // where `o` is the Hadamard pdoruct. + for (std::size_t j = 0; j < q; ++j) { + theta(j, j) = z[j] * z[j]; + for (std::size_t k = j + 1; k < q; ++k) { + theta(j, k) = 2.0 * z[j] * z[k]; + } + } + + // With `theta_alpha(x)` compute the classic Ising model Score at + // `theta = theta_alpha(x)` for a single observation `y`. + // `S = grad_theta l` + const auto S = ising_score(theta, y, q); + + // convert classic ising model Score to conditional Ising model Score + // by using: + // `grad_alpha l = 2 x x' alpha vec^-1(D_q grad_theta l)` + // where `D_q` is the duplication matrix. + // For each column of `grad_alpha l` + for (std::size_t k = 0; k < q; ++k) { + // compute the `k`th vector matrix product `(z' S)_k` component + double zS_k = 0.0; + for (std::size_t l = 0; l < q; ++l) { + zS_k += z[l] * S[theta.index(std::min(k, l), std::max(k, l))]; + } + // and now add the `k`th column `x (z' S)_k` to the Score + for (std::size_t j = 0; j < p; ++j) { + score[k * p + j] += x[j] * zS_k; + } + } + } + + return (2.0 / static_cast(X.nrow())) * score; +} + +// TODO: Drop or rewrite to test/fix issue with R/Rcpp multithreading issue, +// Apparently I can NOT use Rcpp in conjunction with multiple threads +// as the garbage collector stack gets out of sync. 
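+// Note (added, not in the original TODO): the R API is single-threaded, so
+// worker threads must not create or touch R/Rcpp objects; a possible
+// workaround is to copy the inputs into plain C++ buffers up front and only
+// assemble the Rcpp result on the main thread.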
+// SEE: https://stackoverflow.com/questions/42119609/how-to-handle-mismatching-in-calling-convention-in-r-and-c-using-rcpp +// +// // [[Rcpp::export(rng = false)]] +// Rcpp::NumericVector ising_conditional_score_mt(const Rcpp::NumericMatrix& alpha, +// const Rcpp::NumericMatrix& X, const MVBinary& Y +// ) { +// // get probem dimensions +// const std::size_t p = alpha.nrow(); +// const std::size_t q = alpha.ncol(); + +// // check parameter dimensions +// if (X.nrow() != Y.size() || X.ncol() != p || Y.dim() != q) { +// Rcpp::stop("Parameter dimension missmatch"); +// } + +// // setup the Score (default zero initialized) +// Rcpp::NumericMatrix score(p, q); + +// // setup a thread pool to perform per observation computations in parallel +// ThreadPool pool; + +// // for each observation setup a task to compute the Score this observation +// // Score and then add them up +// for (std::size_t i = 0; i < Y.size(); ++i) { +// // push task to the thread pool, they are executed as soon as possible +// pool.push([&alpha, &X, &Y, i, p, q, /*out*/ &score]() { + +// // get i'th observation (response `y` and predictor `x`) +// const uint32_t y = Y[i]; +// const auto x = X.row(i); + +// // natural parameter for the conditional Ising model +// // `theta_alpha(x) = vech((2 1_q 1_q' - I_q) o Theta_alpha(x))` +// // with `o` denoting the Hadamart product. +// VechView theta(q); +// // temp inbetween variables +// Rcpp::NumericVector z(q); + +// // compute natural parameter from `alpha` with current predictor `x` +// // First `z = alpha' x` +// for (std::size_t j = 0; j < q; ++j) { +// z[j] = 0.0; +// for (std::size_t k = 0; k < p; ++k) { +// z[j] += x[k] * alpha[j * p + k]; +// } +// } +// // and then `vech` of the outer product `z z'` with off diagonal +// // elements multiplied by `2`. (See: `Theta` to `theta` relation) +// // Explicitly (`theta = theta_alpha(x)`): +// // `theta = vech((2 1_q 1_q' - I_q) o (alpha' x x' alpha))` +// // where `o` is the Hadamard pdoruct. +// for (std::size_t j = 0; j < q; ++j) { +// theta(j, j) = z[j] * z[j]; +// for (std::size_t k = j + 1; k < q; ++k) { +// theta(j, k) = 2.0 * z[j] * z[k]; +// } +// } + +// // With `theta_alpha(x)` compute the classic Ising model Score at +// // `theta = theta_alpha(x)` for a single observation `y`. +// // `S = grad_theta l` +// const auto S = ising_score(theta, y, q); + +// // convert classic ising model Score to conditional Ising model Score +// // by using: +// // `grad_alpha l = 2 x x' alpha vec^-1(D_q grad_theta l)` +// // where `D_q` is the duplication matrix. +// // For each column of `grad_alpha l` +// for (std::size_t k = 0; k < q; ++k) { +// // compute the `k`th vector matrix product `(z' S)_k` component +// double zS_k = 0.0; +// for (std::size_t l = 0; l < q; ++l) { +// zS_k += z[l] * S[theta.index(std::min(k, l), std::max(k, l))]; +// } +// // and now add the `k`th column `x (z' S)_k` to the Score +// // Add to output score, `+=` should be atomic (TODO: check this!!!) +// for (std::size_t j = 0; j < p; ++j) { +// score[k * p + j] += x[j] * zS_k; +// } +// } +// }); +// } + +// return (2.0 / static_cast(X.nrow())) * score; +// } + + +//' Fisher information for the natural parameters `theta` under the Ising model. 
+//' +//' I(theta) = -E(H l(theta) | theta) = Cov(vech(Y Y'), vech(Y Y') | theta) +//' +//' where `H l(theta)` is the Hessian of the log-likelihood `l(theta)` defined as +//' +//' l(theta) = n^-1 prod_i P(Y = y_i | theta) +//' = log(p_0(theta)) - mean_i exp(vech(y_i y_i')' theta) +//' +//' Note that the Fisher information does _not_ depend on data. +//' +// [[Rcpp::export(rng = false)]] +Rcpp::NumericMatrix ising_fisher_info(const VechView& theta) { + + // Step 0: Setup (and validate) variables/parameters + const std::size_t p = theta.dim(); + const std::size_t pp = p * (p + 1) / 2; + const uint32_t max_event = static_cast(-1) >> (32 - p); + double sum_0 = 1.0; // accumulates p_0(theta)^-1 + // Initialize result covariance matrix (zero initialized) + Rcpp::NumericMatrix cov(pp, pp); + + // Compute (unscaled) lower triag part of `E(vech(Y Y') vech(Y Y')' | theta)` + for (uint32_t y = 1; y <= max_event; ++y) { + // Compute (unscaled) probability `P(Y = y | theta)` + const double prob = exp(ising_log_odds_sum(y, theta)); + // and add to scaling factor accumulator + sum_0 += prob; + + // Iterate Fisher information column indices (set bits in vech(y y')) + // Those are the LSB lower triag indices of the `a, b` LSBs + for (uint32_t a = y; a; a &= a - 1) { + const std::size_t i = bitScanLS(a); + const std::size_t ti = (i * (2 * p + 1 - i)) / 2; + + for (uint32_t b = a; b; b &= b - 1) { + // Fisher info column index + const std::size_t col = ti + (bitScanLS(b) - i); + + // Fill (lower triangular) Fisher information row positions + // by computing the remaining (higher indexed) of vech(y y') + // via the LSB bits of `c, d` + // TODO: Slight inaccuracy with doing a bit to much work + for (uint32_t c = a; c; c &= c - 1) { + const std::size_t j = bitScanLS(c); + const std::size_t tj = (j * (2 * p + 1 - j)) / 2; + + for (uint32_t d = c; d; d &= d - 1) { + // Fisher info row index (lower triag part) + const std::size_t row = tj + (bitScanLS(d) - j); + + // add (unscaled) probability for current event effects + cov[row + pp * col] += prob; + } + } + } + } + } + + // finish scaling factor + const double p_0 = 1.0 / sum_0; + + // Scale and mirrow lower triag part of `E(vech(Y Y') vech(Y Y')' | theta)` + for (std::size_t col = 0; col < pp; col++) { + for (std::size_t row = col; row < pp; ++row) { + cov[col + pp * row] = (cov[row + pp * col] *= p_0); + } + } + + // Subtract outer product of the expectation + // `cov -= E(vech(Y Y') | theta) E(vech(Y Y') | theta)'` + auto score = ising_marginal_probs(theta); + std::transform(cov.begin(), cov.end(), + Rcpp::outer(score, score, std::multiplies()).begin(), + cov.begin(), std::minus()); + + return cov; +} + +//' Samples from the Ising model given the natural parameters `theta` +//' +// [[Rcpp::export(rng = true)]] +MVBinary ising_sample(const std::size_t n, const VechView& theta, + const std::size_t warmup = 1000 +) { + const std::size_t p = theta.dim(); + const std::size_t max_event = static_cast(-1) >> (32 - p); + + // for small dimensions we use Rcpp::sample + if (p < 5) { + // setup complete probability vector for every possible event + const auto probs = ising_probs(theta); + + // sample from all possible events given probabilities for each + const auto events = Rcpp::sample(1 << p, n, true, probs, false); + + return MVBinary(events.begin(), events.end(), p); + } + // Else: "non trivial implementation" case: Gibbs sampling + + // RNG for continuous uniform `U[0, 1]` + auto runif = Rcpp::stats::UnifGenerator(0, 1); + + // conditional probability `P(Y_i = 1 | 
Y_-i = y_-i) = exp_i / (1 + exp_i)` + // where `exp_i = exp(theta_i + sum_{j != i} y_j theta_{i,j})` + auto icond_prob = [&theta](const std::size_t i, const uint32_t y) { + double log_odds = theta(i); + for (uint32_t a = y & ~(static_cast(1) << i); a; a &= a - 1) { + const std::size_t j = bitScanLS(a); + log_odds += theta(std::min(i, j), std::max(i, j)); + } + return 1.0 / (1.0 + exp(-log_odds)); + }; + + // Reserve result data set + MVBinary events(n, p); + + // repeat untill `n` samples are drawn + while (events.size() < n) { + // Initialize sample as vector of independent bernoulli draws with + // component wise probability `P(Y_i = 1 | Y_-i = 0)` + uint32_t y = 0; + for (std::size_t i = 0; i < p; ++i) { + y |= static_cast((runif() * (1.0 + exp(-theta(i)))) < 1.0) << i; + } + + // repeated cyclic updating + for (std::size_t rep = 0; rep < warmup; ++rep) { + for (std::size_t i = 0; i < p; ++i) { + // event with the `i`th component `1` and all others `0` + const uint32_t ei = static_cast(1) << i; + // sample `i`th bit from `Bernoulli(P(Y_i = 1 | Y_-i = y_-i))` + y = (y & ~ei) | (ei * (runif() < icond_prob(i, y))); + } + } + + // add generated sample to result set + events.push_back(y); + } + + return events; +} diff --git a/mvbernoulli/src/print.cpp b/mvbernoulli/src/print.cpp new file mode 100644 index 0000000..aa53f98 --- /dev/null +++ b/mvbernoulli/src/print.cpp @@ -0,0 +1,53 @@ +#include + +#include + +#include "types.h" + +// [[Rcpp::export(name = "print.mvbinary", rng = false)]] +void print_mvbinary(const MVBinary& binary, int nrLines = 10) { + // Divider bits + constexpr uint32_t div = 0x88888888; + + // Get nr of element to be printed + nrLines = nrLines < binary.size() ? nrLines : binary.size(); + + // from first to nr lines print binary events to console + for (int i = 0; i < nrLines; ++i) { + uint32_t val = binary[i]; + Rcpp::Rcout << std::setw(12) << std::right << val << ": "; + for (uint32_t j = 0; j < binary.dim(); ++j) { + Rcpp::Rcout << ((val & (1 << j)) ? '1' : '.'); + if (div & (1 << j)) { + Rcpp::Rcout << ' '; + } + } + Rcpp::Rcout << '\n'; + } + + // report skipped entries (if any) + if (nrLines < binary.size()) { + Rcpp::Rcout << "[ skipping " << (binary.size() - nrLines) << " lines ]\n"; + } + + Rcpp::Rcout << std::flush; +} + +// [[Rcpp::export(rng = false)]] +void printBits(const Rcpp::IntegerVector& ints) { + // Value with only the left most bit set + constexpr uint32_t lmb = 1UL << 31; + + for (uint32_t val : ints) { + Rcpp::Rcout << std::setw(12) << std::right << val << ": "; + for (int j = 0; j < 8; ++j) { + for (int k = 0; k < 4; ++k) { + Rcpp::Rcout << ((val & lmb) ? '1' : '.'); + val <<= 1; + } + Rcpp::Rcout << ' '; + } + Rcpp::Rcout << '\n'; + } + Rcpp::Rcout << std::flush; +} diff --git a/mvbernoulli/src/stats.cpp b/mvbernoulli/src/stats.cpp new file mode 100644 index 0000000..2e8dc73 --- /dev/null +++ b/mvbernoulli/src/stats.cpp @@ -0,0 +1,115 @@ +/** + * Implements statistics like `mean`, `cov` and alike for MVBinary data + */ +#include // R to C++ binding library +#include + +#include "bit_utils.h" // uint32_t, ... 
and the `bit*` functions +#include "types.h" // MVBinary (Multivariate Binary Data) + +//' Converts a logical matrix to a multi variate bernoulli dataset +//' +// [[Rcpp::export(rng = false, name = "as.mvbinary")]] +MVBinary as_mvbinary(const MVBinary& Y) { return Y; } + +//' Converts a Multivariate binary data set into a logical matrix +//' +// [[Rcpp::export(rng = false, name = "as.mvbmatrix")]] +Rcpp::LogicalMatrix as_mvbmatrix(const MVBinary& Y) { + Rcpp::LogicalMatrix mat(Y.nrow(), Y.ncol()); + + for (std::size_t i = 0; i < Y.nrow(); ++i) { + for (uint32_t a = Y[i]; a; a &= a - 1) { + mat[bitScanLS(a) * Y.nrow() + i] = true; + } + } + + return mat; +} + +//' Mean for a multi variate bernoulli dataset `MVBinary` +//' +//' mean_i y_i # twoway = false (only single effects) +//' +//' or +//' +//' mean_i vech(y_i y_i') # twoway = true (with two-way interactions) +//' +// [[Rcpp::export(rng = false, name = "mean.mvbinary")]] +Rcpp::NumericVector mean_mvbinary(const MVBinary& Y, const bool twoway = false) { + if (!twoway) { + // mean initialized as `p` dim zero vector + Rcpp::NumericVector mean(Y.dim()); + + // setup scaling factor `1 / n` + const double inv_n = 1.0 / static_cast(Y.size()); + + // iterate all events + for (const auto& y : Y) { + // and add set features + for (auto a = y; a; a &= a - 1) { + mean[bitScanLS(a)] += inv_n; + } + } + + return mean; + } else { + // Including two-way interactions + Rcpp::NumericVector mean(Y.dim() * (Y.dim() + 1) / 2); + + // get binary vector dimension + const int p = Y.dim(); + + // iterate all events + for (const auto& y : Y) { + // iterate event features + for (auto a = y; a; a &= a - 1) { + int i = bitScanLS(a); + int base_index = (i * (2 * p + 1 - i)) / 2; + // add single effect + mean[base_index] += 1.0; + // iterate event two way effects + for (auto b = a & (a - 1); b; b &= b - 1) { + // and add the two way effect + mean[base_index + bitScanLS(b) - i] += 1.0; + } + } + } + + // counts scaled by sample size + return mean / static_cast(Y.size()); + } +} + +//' Covariance for multi variate binary data `MVBinary` +//' +//' cov(Y) = (n - 1)^-1 sum_i (y_i - mean(Y)) (y_i - mean(Y))' +//' +// [[Rcpp::export(rng = false, name = "cov.mvbinary")]] +Rcpp::NumericMatrix cov_mvbinary(const MVBinary& Y) { + // get random variable dimension + const std::size_t p = Y.dim(); + + // initialize covariance (default zero initialized) + Rcpp::NumericMatrix cov(p, p); + + // step 1: compute the mean (in reversed internal order) + const auto mean = mean_mvbinary(Y); + + // iterate all events in `Y` + for (const auto& y : Y) { + for (std::size_t j = 0; j < p; ++j) { + for (std::size_t i = 0; i < p; ++i) { + cov[i + p * j] += (static_cast(y & (1 << i)) - mean[i]) + * (static_cast(y & (1 << j)) - mean[j]); + } + } + } + + // scale by `1 / (n - 1)` + const double inv_nm1 = 1.0 / static_cast(Y.size() - 1); + std::transform(cov.begin(), cov.end(), cov.begin(), + [inv_nm1](const double c) { return c * inv_nm1; }); + + return cov; +} diff --git a/mvbernoulli/src/types.h b/mvbernoulli/src/types.h new file mode 100644 index 0000000..71af5b8 --- /dev/null +++ b/mvbernoulli/src/types.h @@ -0,0 +1,111 @@ +#ifndef TYPES_INCLUDE_GUARD_H +#define TYPES_INCLUDE_GUARD_H + +#include +#include +#include +#include + +#include "int_utils.h" + +/** + * Multivariate Binary Dataset + * + * Note: The maximum binary vector size for one observation is 32. 
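+ *
+ * Each observation is packed into a single `uint32_t` with component `j`
+ * stored in bit `j` (least significant bit first). Illustrative example
+ * (not part of the original sources): for `p = 3` the event
+ * `(Y_1, Y_2, Y_3) = (1, 0, 1)` is encoded as the bit pattern `0b101`,
+ * i.e. the integer `5`.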
+ */
+class MVBinary : public std::vector<uint32_t> {
+public:
+    MVBinary(std::size_t n, std::size_t p) : p{p} {
+        this->reserve(n);
+    }
+    template <class _InputIterator>
+    MVBinary(_InputIterator first, _InputIterator last, std::size_t p)
+        : p{p}
+        , std::vector<uint32_t>(first, last) { };
+
+    std::size_t dim() const { return p; }
+    std::size_t nrow() const { return size(); }
+    std::size_t ncol() const { return p; }
+
+private:
+    std::size_t p = 0;
+};
+
+/**
+ * View to a SEXP numeric vector representing a half vectorized matrix.
+ *
+ * This means that there exists a `p` such that the length is `p (p + 1) / 2`.
+ */
+class VechView {
+public:
+
+    VechView(const std::size_t p)
+        : _sexp{nullptr}
+        , _size{p * (p + 1) / 2}
+        , _dim{p}
+        , _data{new double[_size]} { }
+
+    // ctor from an R object, this instance is a view into the SEXP's data
+    VechView(SEXP x) : _sexp{x} {
+        // check type
+        if (TYPEOF(x) != REALSXP) {
+            throw std::invalid_argument("expected numeric vector");
+        }
+        // get size, compute underlying dimension and validate size consistency
+        // which is that `_size = length(x) = _dim (_dim + 1) / 2`.
+        _size = Rf_length(x);
+        if (!(_dim = invTriag(_size))) {
+            throw std::invalid_argument("Expected `length(theta) == p * (p + 1) / 2`");
+        }
+        // set data memory hook
+        _data = REAL(x);
+    }
+    // // for now, do not allow to use `Rcpp::wrap`
+    // operator SEXP() { return _sexp; };
+    // operator SEXP() const { return _sexp; };
+
+    // dtor: in case of owning the data instead of being a view into a SEXP
+    // the acquired memory needs to be freed
+    ~VechView() {
+        if (!_sexp) {
+            delete[] _data;
+        }
+    }
+
+    std::size_t size() const { return _size; }
+    std::size_t dim() const { return _dim; }
+
+    std::size_t index(std::size_t i, std::size_t j) const {
+        return (i * (2 * _dim - 1 - i)) / 2 + j;
+    }
+    std::size_t index(std::size_t i) const { return index(i, i); }
+
+    double  operator[](std::size_t i) const { return _data[i]; }
+    double& operator[](std::size_t i) { return _data[i]; }
+    double  operator()(std::size_t i, std::size_t j) const {
+        return _data[(i * (2 * _dim - 1 - i)) / 2 + j];
+    }
+    double& operator()(std::size_t i, std::size_t j) {
+        return _data[(i * (2 * _dim - 1 - i)) / 2 + j];
+    }
+    double  operator()(std::size_t i) const { return (*this)(i, i); }
+    double& operator()(std::size_t i) { return (*this)(i, i); }
+
+    double* begin() { return _data; }
+    double* end() { return _data + _size; }
+    const double* begin() const { return _data; }
+    const double* end() const { return _data + _size; }
+    const double* cbegin() const { return _data; }
+    const double* cend() const { return _data + _size; }
+
+private:
+    SEXP _sexp;         // original R object of which this is a view to the data
+    std::size_t _size;  // length of _data
+    std::size_t _dim;   // dimension of the referenced _data, relation to _size
+                        // is given by `2 _size = _dim (_dim + 1)`
+    double* _data;      // pointer to underlying data
+    // bool _owner = false;// set to true if the SEXP object is created and needs
+    //                     // to be unprotected (from the GC) at destruction
+};
+
+#endif /* TYPES_INCLUDE_GUARD_H */
diff --git a/sim/normal.R b/sim/normal.R
new file mode 100644
index 0000000..077ebd2
--- /dev/null
+++ b/sim/normal.R
@@ -0,0 +1,165 @@
+library(tensorPredictors)
+
+set.seed(314159265, "Mersenne-Twister", "Inversion", "Rejection")
+
+### simulation configuration
+reps <- 100                                 # number of simulation replications
+sample.sizes <- c(100, 200, 300, 500, 750)  # sample sizes `n`
+N <- 2000                                   # validation set size
+p <- c(2, 3, 5)                             # predictor dimensions
+q <- c(1, 2, 3)                             # functions of y dimensions (response dimensions)
+
+# initial consistency
checks +stopifnot(exprs = { + length(p) == length(q) + all(outer(p, sample.sizes, `<`)) +}) + +# setup model parameters +alphas <- Map(matrix, Map(rnorm, p * q), p) # reduction matrices +Omegas <- Map(function(pj) 0.5^abs(outer(1:pj, 1:pj, `-`)), p) # mode scatter +eta1 <- 0 # intercept + +# data sampling routine +sample.data <- function(n, eta1, alphas, Omegas, sample.axis = length(alphas) + 1L) { + r <- length(alphas) # tensor order + + # generate response (sample axis is last axis) + y <- sample.int(prod(q), n, replace = TRUE) # uniform samples + Fy <- array(outer(seq_len(prod(q)), y, `==`), dim = c(q, n)) + Fy <- Fy - c(rowMeans(Fy, dims = r)) + + # sample predictors as X | Y = y (sample axis is last axis) + Deltas <- Map(solve, Omegas) # normal covariances + mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) # conditional mean + X <- mu_y + rtensornorm(n, 0, Deltas, r + 1L) # responses X + + # permute axis to requested get the sample axis + if (sample.axis != r + 1L) { + perm <- integer(r + 1L) + perm[sample.axis] <- r + 1L + perm[-sample.axis] <- seq_len(r) + X <- aperm(X, perm) + Fy <- aperm(Fy, perm) + } + + list(X = X, Fy = Fy, sample.axis = sample.axis) +} + +# projection matrix `P_A` as a projection onto the span of `A` +proj <- function(A) tcrossprod(A, A %*% solve(crossprod(A, A))) + +### Logging Errors and Warnings +# Register a global warning and error handler for logging warnings/errors with +# current simulation repetition session informatin allowing to reproduce problems +exceptionLogger <- function(ex) { + # retrieve current simulation repetition information + rep.info <- get("rep.info", envir = .GlobalEnv) + # setup an error log file with the same name as `file` + log <- paste0(rep.info$file, ".log") + # Write (append) condition message with reproduction info to the log + cat("\n\n------------------------------------------------------------\n", + sprintf("file <- \"%s\"\nn <- %d\nrep <- %d\n.Random.seed <- c(%s)\n%s\nTraceback:\n", + rep.info$file, rep.info$n, rep.info$rep, + paste(rep.info$.Random.seed, collapse = ","), + as.character.error(ex) + ), sep = "", file = log, append = TRUE) + # add Traceback (see: `traceback()` which the following is addapted from) + n <- length(x <- .traceback(NULL, max.lines = -1L)) + if (n == 0L) { + cat("No traceback available", "\n", file = log, append = TRUE) + } else { + for (i in 1L:n) { + xi <- x[[i]] + label <- paste0(n - i + 1L, ": ") + m <- length(xi) + srcloc <- if (!is.null(srcref <- attr(xi, "srcref"))) { + srcfile <- attr(srcref, "srcfile") + paste0(" at ", basename(srcfile$filename), "#", srcref[1L]) + } + if (isTRUE(attr(xi, "truncated"))) { + xi <- c(xi, " ...") + m <- length(xi) + } + if (!is.null(srcloc)) { + xi[m] <- paste0(xi[m], srcloc) + } + if (m > 1) { + label <- c(label, rep(substr(" ", 1L, + nchar(label, type = "w")), m - 1L)) + } + cat(paste0(label, xi), sep = "\n", file = log, append = TRUE) + } + } +} +globalCallingHandlers(list( + message = exceptionLogger, warning = exceptionLogger, error = exceptionLogger +)) + + +### for every sample size +start <- format(Sys.time(), "%Y%m%dT%H%M") +for (n in sample.sizes) { + ### write new simulation result file + file <- paste0(paste("sim-normal", start, n, sep = "-"), ".csv") + # CSV header, used to ensure correct value collumn mapping when writing to file + header <- c( + "dist.subspace.gmlm", "dist.subspace.hopca", "dist.subspace.pca", + "dist.projection.gmlm", "dist.projection.hopca", "dist.projection.pca", + "error.pred.gmlm", "error.pred.hopca", "error.pred.pca" + ) + 
cat(paste0(header, collapse = ","), "\n", sep = "", file = file) + + ### repeated simulation + for (rep in seq_len(reps)) { + ### Repetition session state info + # Stores specific session variables before starting the current + # simulation replication. This allows to log state information which + # can be used to replicate a specific simulation repetition in case of + # errors/warnings from the logs + rep.info <- list(n = n, rep = rep, file = file, .Random.seed = .Random.seed) + + ### sample (training) data + c(X, Fy, sample.axis) %<-% sample.data(n, eta1, alphas, Omegas) + + ### Fit data using different methods + fit.gmlm <- GMLM.default(X, Fy, sample.axis = sample.axis) + fit.hopca <- HOPCA(X, npc = q, sample.axis = sample.axis) + fit.pca <- prcomp(mat(X, sample.axis), rank. = prod(q)) + + ### Compute Reductions `B.*` where `B.*` spans the reduction subspaces + B.true <- Reduce(`%x%`, rev(Map(`%*%`, Omegas, alphas))) + B.gmlm <- with(fit.gmlm, Reduce(`%x%`, rev(Map(`%*%`, Omegas, alphas)))) + B.hopca <- Reduce(`%x%`, rev(fit.hopca)) + B.pca <- fit.pca$rotation + + # Subspace Distances: Normalized `|| P_A - P_B ||_F` where + # `P_A = A (A' A)^-1/2 A'` and the normalization means that with + # respect to the dimensions of `A, B` the subspace distance is in the + # range `[0, 1]`. + dist.subspace.gmlm <- dist.subspace(B.true, B.gmlm, normalize = TRUE) + dist.subspace.hopca <- dist.subspace(B.true, B.hopca, normalize = TRUE) + dist.subspace.pca <- dist.subspace(B.true, B.pca, normalize = TRUE) + + # Projection Distances: Spectral norm (2-norm) `|| P_A - P_B ||_2`. + dist.projection.gmlm <- dist.projection(B.true, B.gmlm) + dist.projection.hopca <- dist.projection(B.true, B.hopca) + dist.projection.pca <- dist.projection(B.true, B.pca) + + ### Prediction Errors: (using new independend sample of size `N`) + c(X, Fy, sample.axis) %<-% sample.data(N, eta1, alphas, Omegas) + # centered model matrix of vectorized `X`s + vecX <- scale(mat(X, sample.axis), center = TRUE, scale = FALSE) + P.true <- proj(B.true) + error.pred.gmlm <- norm(P.true - proj(B.gmlm), "2") + error.pred.hopca <- norm(P.true - proj(B.hopca), "2") + error.pred.pca <- norm(P.true - proj(B.pca), "2") + + # format estimation/prediction errors and write to file and console + line <- paste0(Map(get, header), collapse = ",") + cat(line, "\n", sep = "", file = file, append = TRUE) + # report progress + cat(sprintf("sample size: %d/%d - rep: %d/%d\n", + which(n == sample.sizes), length(sample.sizes), rep, reps)) + } +} diff --git a/simulations/eeg_sim.R b/simulations/eeg_sim.R index 72e9d85..f396ef1 100644 --- a/simulations/eeg_sim.R +++ b/simulations/eeg_sim.R @@ -50,9 +50,25 @@ npcs <- list(c(3, 4), c(15, 15), c(20, 30), dim(X)[-1]) # setup methods for simulation (with unified API) methods <- list( hopca = list( - fun = function(X, Fy) list(alphas = hopca(X, npc = c(1L, 1L), 1L)), + fun = function(X, Fy) list(alphas = HOPCA(X, npc = c(1L, 1L), 1L)), is.applicable = function(npc) all(npc == c(256L, 64L)) # NOT reduced ), + hopir.ls.icu = list( + fun = function(X, Fy) HOPIR(X, Fy, sample.axis = 1L, method = "ls", algorithm = "icu"), + is.applicable = function(npc) TRUE + ), + hopir.mle.icu = list( + fun = function(X, Fy) HOPIR(X, Fy, sample.axis = 1L, method = "mle", algorithm = "icu"), + is.applicable = function(npc) TRUE + ), + hopir.ls.nagd = list( + fun = function(X, Fy) HOPIR(X, Fy, sample.axis = 1L, method = "ls", algorithm = "nagd"), + is.applicable = function(npc) TRUE + ), + hopir.mle.nagd = list( + fun = function(X, Fy) 
HOPIR(X, Fy, sample.axis = 1L, method = "mle", algorithm = "nagd"), + is.applicable = function(npc) TRUE + ), kpir.base = list( fun = toNewAPI(kpir.base), is.applicable = function(npc) prod(npc) < 100 @@ -74,7 +90,7 @@ methods <- list( res <- LSIR(matrix(X, nrow(X)), Fy, dim(X)[2], dim(X)[3]) list(alphas = list(res$beta, res$alpha)) }, - is.applicable = function(npc) prod(npc) < 1000 + is.applicable = function(npc) TRUE ), kpir.momentum.vlp = list( fun = toNewAPI(function(X, Fy) kpir.momentum(X, Fy, init.method = "vlp")), @@ -107,7 +123,7 @@ for (npc in npcs) { if (any(npc < dim(X)[-1])) { # Reduce dimensions using (2D)^2 PCA, which is a special case of the Higher # Order Principal Component Analysis - pcs <- hopca(X, npc = npc, sample.axis = 1) + pcs <- HOPCA(X, npc = npc, sample.axis = 1) # Reduce dimensions X.pc <- mlm(X, Map(t, pcs), modes = 2:3) } else { @@ -185,6 +201,7 @@ for (npc in npcs) { # sim <- readRDS("eeg_sim_.rds") # sim <- readRDS("eeg_sim_20220524T2100.rds") # sim <- readRDS("eeg_sim_20220525T1700.rds") +# sim <- readRDS("eeg_sim_20220628T1222.rds") metrics <- list( # acc: Accuracy. P(Yhat = Y). Estimated as: (TP+TN)/(P+N). @@ -222,31 +239,58 @@ times <- aggregate(cbind(elapsed, sys.self, user.self) ~ method + npc, sim, medi print(times, digits = 2) -## stats: 2022.05.24 +## stats: 2022.05.24 + 2022.06.14 # method npc Acc Err FPR TPR FNR TNR AUC sd(AUC) # 1 kpir.base (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75 0.047 # 2 kpir.new.vlp (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75 0.047 # 3 kpir.new.ls (3, 4) 0.74 0.26 0.51 0.88 0.12 0.49 0.77 0.045 # 4 kpir.ls (3, 4) 0.75 0.25 0.49 0.88 0.12 0.51 0.78 0.044 # (*) kpir.ls (3, 4) 0.78 0.22 0.38 0.87 0.13 0.62 0.86 0.034 +# (*) hopir.ls.icu (3, 4) 0.80 0.20 0.33 0.87 0.13 0.67 0.85 0.036 +# (*) hopir.mle.icu (3, 4) 0.80 0.20 0.33 0.87 0.13 0.67 0.85 0.036 # 5 kpir.momentum.vlp (3, 4) 0.70 0.30 0.60 0.87 0.13 0.40 0.75 0.047 # 6 kpir.momentum.ls (3, 4) 0.70 0.30 0.58 0.87 0.13 0.42 0.76 0.046 # 7 kpir.approx.vlp (3, 4) 0.68 0.32 0.62 0.86 0.14 0.38 0.74 0.048 # 8 kpir.approx.ls (3, 4) 0.73 0.27 0.53 0.88 0.12 0.47 0.78 0.044 +# (**) LSIR (3, 4) 0.80 0.20 0.36 0.88 0.12 0.64 0.85 0.036 # 9 kpir.ls (15, 15) 0.75 0.25 0.47 0.87 0.13 0.53 0.78 0.044 # (*) kpir.ls (15, 15) 0.76 0.24 0.44 0.88 0.12 0.56 0.83 0.039 # 10 kpir.approx.ls (15, 15) 0.73 0.27 0.51 0.87 0.13 0.49 0.78 0.044 # 11 kpir.ls (20, 30) 0.75 0.25 0.47 0.87 0.13 0.53 0.78 0.044 # (*) kpir.ls (20, 30) 0.77 0.23 0.36 0.84 0.16 0.64 0.79 0.045 +# (*) hopir.ls.icu (15, 15) 0.79 0.21 0.38 0.88 0.12 0.62 0.83 0.041 +# (*) hopir.mle.icu (15, 15) 0.79 0.21 0.38 0.88 0.12 0.62 0.83 0.041 +# (**) LSIR (15, 15) 0.72 0.28 0.44 0.82 0.18 0.56 0.81 0.040 +# (*) hopir.ls.icu (20, 30) 0.75 0.25 0.38 0.83 0.17 0.62 0.80 0.045 +# (*) hopir.mle.icu (20, 30) 0.75 0.25 0.38 0.83 0.17 0.62 0.80 0.045 # 12 kpir.approx.ls (20, 30) 0.63 0.37 1.00 1.00 0.00 0.00 0.51 0.053 +# (**) LSIR (20, 30) 0.79 0.21 0.36 0.87 0.13 0.64 0.83 0.038 # 13 kpir.ls (256, 64) 0.75 0.25 0.44 0.87 0.13 0.56 0.78 0.044 # (*) kpir.ls (256, 64) 0.68 0.32 0.51 0.79 0.21 0.49 0.66 0.054 +# (*) hopir.ls.icu (256, 64) 0.67 0.33 0.53 0.79 0.21 0.47 0.69 0.052 +# (*) hopir.mle.icu (256, 64) 0.67 0.33 0.53 0.79 0.21 0.47 0.69 0.052 # 14 kpir.approx.ls (256, 64) 0.75 0.25 0.44 0.87 0.13 0.56 0.78 0.044 # -# (*) Using reduction matrices `Map(solve, sdr$Deltas, sdr$alphas)` instead -# of only `sdr$alpha`. +# (*) Using reduction matrices `Map(solve, sdr$Deltas, sdr$alphas)` instead +# of only `sdr$alpha`. 
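+#     A hypothetical sketch of this variant (helper code, not part of the
+#     script): combine the per-mode estimates into a single reduction matrix
+#         B <- Reduce(`%x%`, rev(Map(solve, sdr$Deltas, sdr$alphas)))
+#     and use the projections `mat(X, 1) %*% B` as predictors for the
+#     classifier instead of projecting onto `sdr$alphas` alone.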
+# (**) LSIR already considured the covariance estinates -## times: 2022.05.24 +# method npc Acc Err FPR TPR FNR TNR AUC sd(AUC) +# 1 hopir.ls.icu (3, 4) 0.80 0.20 0.33 0.87 0.13 0.67 0.85 0.036 +# 2 hopir.mle.icu (3, 4) 0.80 0.20 0.36 0.88 0.12 0.64 0.85 0.036 +# 3 hopir.ls.nagd (3, 4) 0.80 0.20 0.33 0.87 0.13 0.67 0.85 0.036 +# 4 hopir.mle.nagd (3, 4) 0.80 0.20 0.33 0.87 0.13 0.67 0.85 0.036 +# 5 hopir.ls.icu (15, 15) 0.79 0.21 0.38 0.88 0.12 0.62 0.83 0.041 +# 6 hopir.mle.icu (15, 15) 0.77 0.23 0.40 0.87 0.13 0.60 0.83 0.041 +# 7 hopir.ls.nagd (15, 15) 0.79 0.21 0.38 0.88 0.12 0.62 0.83 0.041 +# 8 hopir.mle.nagd (15, 15) 0.76 0.24 0.47 0.90 0.10 0.53 0.81 0.043 +# 9 hopir.ls.icu (20, 30) 0.75 0.25 0.38 0.83 0.17 0.62 0.80 0.045 +# 10 hopir.mle.icu (20, 30) 0.75 0.25 0.40 0.83 0.17 0.60 0.83 0.039 +# 11 hopir.ls.nagd (20, 30) 0.75 0.25 0.38 0.83 0.17 0.62 0.80 0.045 +# 12 hopir.mle.nagd (20, 30) 0.75 0.25 0.42 0.86 0.14 0.58 0.80 0.044 +# 13 hopir.ls.icu (256, 64) 0.67 0.33 0.53 0.79 0.21 0.47 0.69 0.052 + +## times: 2022.05.24 + 2022.06.14 # method npc elapsed sys.self user.self # 1 kpir.base (3, 4) 0.079 0.402 0.220 # 2 kpir.new.vlp (3, 4) 0.075 0.393 0.217 @@ -254,13 +298,24 @@ print(times, digits = 2) # 4 kpir.ls (3, 4) 0.003 0.006 0.006 # 5 kpir.momentum.vlp (3, 4) 0.143 0.595 0.359 # 6 kpir.momentum.ls (3, 4) 0.297 0.252 0.385 +# (*) hopir.ls.icu (3, 4) 0.004 0.009 0.008 +# (*) hopir.mle.icu (3, 4) 0.004 0.008 0.007 # 7 kpir.approx.vlp (3, 4) 0.044 0.240 0.152 # 8 kpir.approx.ls (3, 4) 0.066 0.144 0.121 +# LSIR (3, 4) 0.003 0.000 0.003 # 9 kpir.ls (15, 15) 0.012 0.059 0.034 +# (*) hopir.ls.icu (15, 15) 0.018 0.077 0.043 +# (*) hopir.mle.icu (15, 15) 0.018 0.084 0.043 # 10 kpir.approx.ls (15, 15) 0.813 3.911 2.325 +# LSIR (15, 15) 0.011 0.031 0.024 +# (*) hopir.ls.icu (20, 30) 0.037 0.165 0.098 +# (*) hopir.mle.icu (20, 30) 0.036 0.163 0.090 # 11 kpir.ls (20, 30) 0.028 0.129 0.080 # 12 kpir.approx.ls (20, 30) 2.110 10.111 6.290 +# LSIR (20, 30) 0.038 0.119 0.102 # 13 kpir.ls (256, 64) 1.252 6.215 3.681 +# (*) hopir.ls.icu (256, 64) 1.120 4.018 2.979 +# (*) hopir.mle.icu (256, 64) 1.183 4.109 2.974 # 14 kpir.approx.ls (256, 64) 36.754 141.028 147.490 - - +# +# (*) While in Zoom meeting diff --git a/simulations/kpir_sim.R b/simulations/kpir_sim.R index b457f37..6f613b6 100644 --- a/simulations/kpir_sim.R +++ b/simulations/kpir_sim.R @@ -226,8 +226,8 @@ Delta.2 <- sqrt(0.5)^abs(outer(seq_len(q), seq_len(q), `-`)) for (rep in 1:reps) { cat(sprintf("\n\033[1m%4d / %d simulation rep. 
started\033[0m\n", rep, reps)) - alpha.1.true <- alpha.1 <- matrix(rnorm(q * r), q, r) - alpha.2.true <- alpha.2 <- matrix(rnorm(p * k), p, k) + alpha.1.true <- alpha.1 <- matrix(rnorm(p * k), p, k) + alpha.2.true <- alpha.2 <- matrix(rnorm(q * r), q, r) y <- rnorm(n) Fy <- do.call(cbind, Map(function(slope, offset) { sin(slope * y + offset) @@ -236,10 +236,10 @@ for (rep in 1:reps) { head(rep(c(0, pi / 2), ceiling(0.5 * k * r)), k * r) )) dim(Fy) <- c(n, k, r) - X <- mlm(Fy, alpha.1, alpha.2, modes = 3:2) + X <- mlm(Fy, list(alpha.1, alpha.2), 2:3) X <- X + rtensornorm(n, 0, Delta.1, Delta.2, sample.axis = 1L) - hist.sim <- sim(X, Fy, alpha.1.true, alpha.2.true, max.iter = max.iter) + hist.sim <- sim(X, Fy, alpha.2.true, alpha.1.true, max.iter = max.iter) hist.sim$repetition <- rep hist <- rbind(hist, hist.sim) diff --git a/tensorPredictors/DESCRIPTION b/tensorPredictors/DESCRIPTION index e109f62..778a722 100644 --- a/tensorPredictors/DESCRIPTION +++ b/tensorPredictors/DESCRIPTION @@ -12,4 +12,5 @@ Depends: R(>= 3.0) Imports: stats Suggests: RSpectra Encoding: UTF-8 +Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 diff --git a/tensorPredictors/NAMESPACE b/tensorPredictors/NAMESPACE index 9948cf4..6c7a884 100644 --- a/tensorPredictors/NAMESPACE +++ b/tensorPredictors/NAMESPACE @@ -6,15 +6,31 @@ export("%x_2%") export("%x_3%") export("%x_4%") export(CISE) +export(D) +export(D.pinv) +export(GMLM) +export(GMLM.default) +export(HOPCA) +export(HOPIR) +export(HOSVD) +export(ICU) +export(K) +export(K.perm) export(LSIR) +export(N) +export(NAGD) export(PCA2d) export(POI) export(RMReg) +export(RMap) +export(S) export(approx.kronecker) export(colKronecker) +export(dist.kron.norm) +export(dist.kron.tr) export(dist.projection) export(dist.subspace) -export(hopca) +export(exprs.all.equal) export(kpir.approx) export(kpir.base) export(kpir.ls) @@ -24,12 +40,20 @@ export(kpir.new) export(mat) export(matpow) export(matrixImage) +export(mcov) export(mcrossprod) +export(mkm) export(mlm) +export(mtvk) +export(num.deriv) export(reduce) export(rowKronecker) export(rtensornorm) export(tensor_predictor) export(ttm) +export(vech) +export(vech.index) +export(vech.pinv) +export(vech.pinv.index) import(stats) useDynLib(tensorPredictors, .registration = TRUE) diff --git a/tensorPredictors/R/GMLM.R b/tensorPredictors/R/GMLM.R new file mode 100644 index 0000000..7a06a19 --- /dev/null +++ b/tensorPredictors/R/GMLM.R @@ -0,0 +1,316 @@ +#' Fitting Generalized Multi-Linear Models +#' +#' @export +GMLM <- function(...) { + stop("Not Implemented") +} + + +make.gmlm.family <- function(name) { + # standardize family name + name <- list( + normal = "normal", gaussian = "normal", + bernoulli = "bernoulli", ising = "bernoulli" + )[[tolower(name), exact = FALSE]] + + ############################################################################ + # # + # TODO: better (and possibly specialized) initial parameters!?!?!?! 
# + # # + ############################################################################ + + switch(name, + normal = { + initialize <- function(X, Fy) { + # observation/predictor tensor order + p <- head(dim(X), -1) + q <- head(dim(Fy), -1) + r <- length(dim(X)) - 1L + + # mu = E[X] = E[E[X | Y]] + mu <- rowMeans(X, dims = r) + # covariance of X (non conditional estimate) + Deltas <- mcov(X, sample.axis = r + 1L) + Omegas <- Map(solve, Deltas) + + # GLM intercept + eta1 <- mlm(mu, Omegas) + + # initialize GLM reduction matrices + Sigmas <- mcov(Fy, sample.axis = r + 1L) + alphas <- Map(function(j) { + s <- min(p[j], q[j]) + L <- with(La.svd(Deltas[[j]]), { + u[, 1:s] %*% diag(d[1:s]^-0.5, s, s) + }) + R <- with(La.svd(Sigmas[[j]]), { + diag(d[1:s]^-0.5, s, s) %*% vt[1:s, ] + }) + L %*% R + }, seq_len(r)) + + list( + eta1 = eta1, + alphas = alphas, + Omegas = Omegas + ) + } + + # parameters of the tensor normal computed from the GLM parameters + params <- function(Fy, eta1, alphas, Omegas) { + Deltas <- Map(solve, Omegas) + mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) + list(mu_y = mu_y, Deltas = Deltas) + } + + # scaled negative log-likelihood + log.likelihood <- function(X, Fy, eta1, alphas, Omegas) { + n <- tail(dim(X), 1) # sample size + + # conditional mean + mu_y <- mlm(mlm(Fy, alphas) + c(eta1), Map(solve, Omegas)) + + # negative log-likelihood scaled by sample size + # Note: the `suppressWarnings` is cause `log(mapply(det, Omegas)` + # migth fail, but `NAGD` has failsaves againt cases of "illegal" + # parameters. + suppressWarnings( + 0.5 * prod(p) * log(2 * pi) + + sum((X - mu_y) * mlm(X - mu_y, Omegas)) / (2 * n) - + (0.5 * prod(p)) * sum(log(mapply(det, Omegas)) / p) + ) + } + # gradient of the scaled negative log-likelihood + grad <- function(X, Fy, eta1, alphas, Omegas) { + # retrieve dimensions + n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions + q <- head(dim(Fy), -1) # response dimensions + r <- length(p) # single predictor/response tensor order + + ## "Inverse" Link: Tensor Normal Specific + # known exponential family constants + c1 <- 1 + c2 <- -0.5 + + # Covariances from the GLM parameter Scatter matrices + Deltas <- Map(solve, Omegas) + + # First moment via "inverse" link `g1(eta_y) = E[X | Y = y]` + E1 <- mlm(mlm(Fy, alphas) + c(eta1), Deltas) + + # Second moment via "inverse" link `g2(eta_y) = E[vec(X) vec(X)' | Y = y]` + dim(E1) <- c(prod(p), n) + E2 <- Reduce(`%x%`, rev(Deltas)) + rowMeans(colKronecker(E1, E1)) + ## end "Inverse" Link + + dim(X) <- c(prod(p), n) + + # Residuals + R <- X - E1 + dim(R) <- c(p, n) + + # mean deviation between the sample covariance to GLM estimated covariance + # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` + S <- rowMeans(colKronecker(X, X)) - E2 # <- Optimized for Tensor Normal + dim(S) <- c(p, p) # reshape to tensor or order `2 r` + + # Gradients of the negative log-likelihood scaled by sample size + list( + "Dl(eta1)" = -c1 * rowMeans(R, dims = r), + "Dl(alphas)" = Map(function(j) { + (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) + }, 1:r), + "Dl(Omegas)" = Map(function(j) { + deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) + # addapt to symmetric constraint for the derivative + dim(deriv) <- c(p[j], p[j]) + deriv + t(deriv * (1 - diag(p[j]))) + }, 1:r) + ) + } + }, + bernoulli = { + require(mvbernoulli) + + initialize <- function(X, Fy) { + # retrieve dimensions + n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions + q <- head(dim(Fy), -1) # 
response dimensions + r <- length(p) # single predictor/response tensor order + + # Half vectorized two-way interaction stats E[vech(vec(X) vec(X)')] + dim(X) <- c(prod(p), n) + T2 <- rowMeans(colKronecker(X, X)[vech.index(prod(p)), ]) + + # If there are any 0 or 1 entries in T2, then theta contains + # +-infinity corresponding to certain/impossible events. + # Make this robust by squishing the domain a bit! + T2 <- 0.01 + 0.98 * T2 + + # take the expected two-way marginal probability estimate and + # equat them with the expected contitional probs from which + # we compute a joint (over all observations) estimate of theta. + theta0 <- ising_theta_from_cond_prob(T2) + + + + list( + eta1 = eta1, + alphas = alphas, + Omegas = Omegas + ) + } + + params <- function(Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # number of observations + n <- tail(dim(Fy), 1) + + # natural exponential family parameters + eta_y1 <- c1 * (mlm(Fy, alphas) + c(eta1)) + eta_y2 <- c2 * Reduce(`%x%`, rev(Omegas)) + + # next the conditional Ising model parameters `theta_y` + theta_y <- rep(eta_y2[lower.tri(eta_y2, diag = TRUE)], n) + dim(theta_y) <- c(nrow(eta_y2) * (nrow(eta_y2) + 1) / 2, n) + + ltri <- which(lower.tri(eta_y2, diag = TRUE)) + diagonal <- which(diag(TRUE, nrow(eta_y2))[ltri]) + theta_y[diagonal, ] <- theta_y[diagonal, ] + c(eta_y1) + theta_y[-diagonal, ] <- 2 * theta_y[-diagonal, ] + + theta_y + } + + # Scaled ngative log-likelihood + log.likelihood <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # number of observations + n <- tail(dim(X), 1L) + + # conditional Ising model parameters + theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) + + # convert to binary data set + X.mvb <- as.mvbinary(mat(X, length(dim(X)))) + + # log-likelihood of the data set + -mean(sapply(seq_len(n), function(i) { + ising_log_likelihood(theta_y[, i], X.mvb[i]) + })) + } + + # Gradient of the scaled negative log-likelihood + grad <- function(X, Fy, eta1, alphas, Omegas, c1 = 1, c2 = 1) { + # retrieve dimensions + n <- tail(dim(X), 1) # sample size + p <- head(dim(X), -1) # predictor dimensions + q <- head(dim(Fy), -1) # response dimensions + r <- length(p) # single predictor/response tensor order + + ## "Inverse" Link: Ising Model Specific + # conditional Ising model parameters: `p (p + 1) / 2` by `n` + theta_y <- params(Fy, eta1, alphas, Omegas, c1, c2) + + # conditional expectations + # ising_marginal_probs(theta_y) = E[vech(vec(X) vec(X)') | Y = y] + E2 <- apply(theta_y, 2L, ising_marginal_probs) + # convert E[vech(vec(X) vec(X)') | Y = y] to E[vec(X) vec(X)' | Y = y] + E2 <- E2[vech.pinv.index(prod(p)), ] + # extract diagonal elements which are equal to E[vec(X) | Y = y] + E1 <- E2[seq.int(from = 1L, to = prod(p)^2, by = prod(p) + 1L), ] + ## end "Inverse" Link + + dim(X) <- c(prod(p), n) + + # Residuals + R <- X - E1 + dim(R) <- c(p, n) + + # mean deviation between the sample covariance to GLM estimated covariance + # `n^-1 sum_i (vec(X_i) vec(X_i)' - g2(eta_yi))` + S <- rowMeans(colKronecker(X, X) - E2) + dim(S) <- c(p, p) # reshape to tensor or order `2 r` + + # Gradients of the negative log-likelihood scaled by sample size + list( + "Dl(eta1)" = -c1 * rowMeans(R, dims = r), + "Dl(alphas)" = Map(function(j) { + (-c1 / n) * mcrossprod(R, mlm(Fy, alphas[-j], (1:r)[-j]), j) + }, 1:r), + "Dl(Omegas)" = Map(function(j) { + deriv <- -c2 * mtvk(mat(S, c(j, j + r)), rev(Omegas[-j])) + # addapt to symmetric constraint for the derivative + dim(deriv) <- c(p[j], p[j]) + deriv + t(deriv * (1 - diag(p[j]))) + }, 1:r) + 
) + } + } + ) + + list( + family = name, + initialize = initialize, + params = params, + # linkinv = linkinv, + log.likelihood = log.likelihood, + grad = grad + ) +} + + +#' @export +GMLM.default <- function(X, Fy, sample.axis = 1L, + family = "normal", + ..., + eps = sqrt(.Machine$double.eps), + logger = NULL +) { + stopifnot(exprs = { + (dim(X) == dim(Fy))[sample.axis] + }) + + # rearrange `X`, `Fy` such that the last axis enumerates observations + axis.perm <- c(seq_along(dim(X))[-sample.axis], sample.axis) + X <- aperm(X, axis.perm) + Fy <- aperm(Fy, axis.perm) + + # setup family specific GLM (pseudo) "inverse" link + family <- make.gmlm.family(family) + + # wrap logger in callback for NAGD + callback <- if (is.function(logger)) { + function(iter, params) { + do.call(logger, c(list(iter), params)) + } + } + + params.fit <- NAGD( + fun.loss = function(params) { + # scaled negative lig-likelihood + # eta1 alphas Omegas + family$log.likelihood(X, Fy, params[[1]], params[[2]], params[[3]]) + }, + fun.grad = function(params) { + # gradient of the scaled negative lig-likelihood + # eta1 alphas Omegas + family$grad(X, Fy, params[[1]], params[[2]], params[[3]]) + }, + params = family$initialize(X, Fy), # initialen parameter estimates + fun.lincomb = function(a, lhs, b, rhs) { + list( + a * lhs[[1]] + b * rhs[[1]], + Map(function(l, r) a * l + b * r, lhs[[2]], rhs[[2]]), + Map(function(l, r) a * l + b * r, lhs[[3]], rhs[[3]]) + ) + }, + fun.norm2 = function(params) { + sum(unlist(params)^2) + }, + callback = callback, + ... + ) + + structure(params.fit, names = c("eta1", "alphas", "Omegas")) +} diff --git a/tensorPredictors/R/hoPCA.R b/tensorPredictors/R/HOPCA.R similarity index 82% rename from tensorPredictors/R/hoPCA.R rename to tensorPredictors/R/HOPCA.R index 0d49e63..e6b217d 100644 --- a/tensorPredictors/R/hoPCA.R +++ b/tensorPredictors/R/HOPCA.R @@ -7,11 +7,10 @@ #' #' @return list of matrices, each entry are the first PCs of the corresponding #' axis. The `i`'th entry are the `npc[i]` first Principal Components of the -#' `i`th axis excluding the sample axis (note: this means there is an index -#' shift after the sample axis). +#' `i`th axis excluding the sample axis. #' #' @export -hopca <- function(X, npc = dim(X)[-sample.axis], sample.axis = 1L) { +HOPCA <- function(X, npc = dim(X)[-sample.axis], sample.axis = 1L) { # observation index numbers (all axis except the sample axis) modes <- seq_along(dim(X))[-sample.axis] @@ -22,7 +21,7 @@ hopca <- function(X, npc = dim(X)[-sample.axis], sample.axis = 1L) { # PCA for each mode (axis) PCs <- Map(function(i) { - La.svd(mcrossprod(X.centered, modes[i]), npc[i], 0)$u + La.svd(mcrossprod(X.centered, mode = modes[i]), npc[i], 0)$u }, seq_along(modes)) # Set names if any diff --git a/tensorPredictors/R/HOPIR.R b/tensorPredictors/R/HOPIR.R new file mode 100644 index 0000000..5d21183 --- /dev/null +++ b/tensorPredictors/R/HOPIR.R @@ -0,0 +1,293 @@ +#' HOPIR subroutine for the LS solution given preprocessed `X`, `Fy` and initial +#' values `alphas`. 
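+#'
+#' The LS step minimizes the mean squared error
+#' \code{mean((X - mlm(Fy, alphas, modes))^2)} over the \code{alphas}, either
+#' by iterative cyclic updates (\code{algorithm = "icu"}) or by Nesterov
+#' accelerated gradient descent (\code{algorithm = "nagd"}). Given the final
+#' \code{alphas}, the \code{Deltas} are the scaled mode-wise cross products
+#' of the residuals \code{X - mlm(Fy, alphas, modes)}.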
+#' +#' @keywords internal +HOPIR.ls <- function(X, Fy, alphas, sample.axis, algorithm, ..., logger) { + # Get axis indices (observation modes) + modes <- seq_along(dim(X))[-sample.axis] + n <- dim(X)[sample.axis] # observation count (scalar) + p <- dim(X)[-sample.axis] # predictor dimensions (vector) + + # Least Squares Deltas Estimates given alphas + fun.Deltas <- function(alphas) { + # Residuals + R <- X - mlm(Fy, alphas, modes = modes) + # `Delta` moment estimates + Deltas <- Map(mcrossprod, list(R), mode = modes) + Map(`*`, p / (n * prod(p)), Deltas) + } + + # wrap logger, provide unified logger interface for all HOPIR subroutines + if (is.function(logger)) { + callback <- function(iter, alphas) { + logger("ls", iter, alphas, fun.Deltas(alphas)) + } + } else { + callback <- NULL + } + + if (algorithm == "icu") { + # Call (proper parameterized) Iterative Cyclic Updating Optimizer + alphas <- ICU( + # Optimization Objective (MSE) + fun.loss = function(alphas) mean((X - mlm(Fy, alphas, modes))^2), + # Updating rule (optimal solution for `alpha_j` given the rest) + fun.update = function(alphas, j) { + Z <- mlm(Fy, alphas[-j], modes = modes[-j]) + # least squares solution for `alpha_j | alpha_i, i != j` + alphas[[j]] <- t(solve( + mcrossprod(Z, Z, modes[j]), mcrossprod(Z, X, modes[j]) + )) + }, + # Initial parameter estimates + params = alphas, + ..., + callback = callback) + } else { + # Call (proper parameterized) Nesterov Accelerated Gradient Descent + alphas <- NAGD( + # Setup objective function (MSE) + fun.loss = function(alphas) mean((X - mlm(Fy, alphas, modes))^2), + # Gradient of the objective with respect to the parameter matirces `alpha_j` + fun.grad = function(alphas) { + # Residuals + R <- X - mlm(Fy, alphas, modes) + # Gradients for each alpha + Map(function(j) { + # MLM of Fy with alpha_k, k in [r] \ j + Fa <- mlm(Fy, alphas[-j], modes[-j]) + # Gradient of the loss with respect to alpha_j + (-2 / prod(dim(X))) * mcrossprod(R, Fa, modes[j]) + }, seq_along(modes)) + }, + params = alphas, + # Linear Combination of parameters, basically: a * lhs + b * rhs for each + # combination of elements in the LHS and RHS lists with scalars a, b. 
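+            # For example (illustrative only):
+            #   fun.lincomb(2, list(A1, A2), -1, list(B1, B2))
+            #   == list(2 * A1 - B1, 2 * A2 - B2)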
+ fun.lincomb = function(a, LHS, b, RHS) { + Map(function(lhs, rhs) a * lhs + b * rhs, LHS, RHS) + }, + # squared norm of parameters + fun.norm2 = function(params) sum(unlist(params, use.names = FALSE)^2), + ..., + callback = callback) + } + + # Final estimate includes Deltas + list(alphas = alphas, Deltas = fun.Deltas(alphas)) +} + +#' HPOIR subroutine for the MLE estimation given proprocessed data and initial +#' alphas, Deltas paramters +#' +#' @keywords internal +HOPIR.mle <- function(X, Fy, alphas, Deltas, sample.axis, algorithm, ..., logger) { + # Get axis indices (observation modes) + modes <- seq_along(dim(X))[-sample.axis] + n <- dim(X)[sample.axis] # observation count (scalar) + p <- dim(X)[-sample.axis] # predictor dimensions (vector) + + if (algorithm == "icu") { + # Call (proper parameterized) Iterative Cyclic Updating Optimizer + params <- ICU( + # Optimization Objective (negative log-likelihood) + fun.loss = function(params) { + # residuals + R <- X - mlm(Fy, params$alphas, modes) + # negative log-likelihood (without additive constant term) + 0.5 * ( + n * prod(p) * sum(log(unlist(Map(det, params$Deltas))) / p) + + sum(mlm(R, Map(solve, params$Deltas), modes) * R) + ) + }, + # Updating rule, optimal solution for `alpha_j` or `Delta_j` given + # all other parameters + fun.update = function(params, index) { + # residuals + R <- X - mlm(Fy, params$alphas, modes) + # mode (axis) index + j <- index[2] + if (index[1] == 1) { + # compute subterms + Delta.invs <- Map(solve, params$Deltas) + Delta.inv.alphas <- Map(`%*%`, Delta.invs, alphas) + XxDi <- mlm(X, Delta.invs[-j], modes[-j]) + Fxa <- mlm(Fy, alphas[-j], modes[-j]) + FxDia <- mlm(Fy, Delta.inv.alphas[-j], modes[-j]) + # alpha update + mcrossprod(XxDi, Fxa, modes[j]) %*% + solve(mcrossprod(FxDia, Fxa, modes[j])) + } else { # index[1] == 2 + # Delta update + (p[j] / (n * prod(p))) * mcrossprod( + mlm(R, Map(solve, params$Deltas[-j]), modes[-j]), R, modes[j]) + } + }, + # collection of initial alpha and Delta parameters + params = list(alphas = alphas, Deltas = Deltas), + # parameter "path"-indices, first index {1, 2} toggles between alphas + # and Deltas, second index is the mode. + # Example: `list(list("a1", "a2", "a3"), list("D1", "D2", "D3"))[[c(2, 1)]] == "D1"` + indices = c(Map(c, 1L, seq_along(modes)), Map(c, 2L, seq_along(modes))), + ..., + callback = if (is.function(logger)) { + function(iter, params) { + logger("mle", iter, params$alphas, params$Deltas) + } + } else { + NULL + }) + } else { + # Call (proper parameterized) Nesterov Accelerated Gradient Descent + # Note that only the `alphas` are subject of Gradient Descent and the + # `Deltas` are additional parameters updated given the cueent `alphas`. + # Meaning that the gradient is the gradient of the loss with respect to + # the `alphas` only. + params <- NAGD( + # Setup objective function (negative log-likelihood) + fun.loss = function(alphas, Deltas) { + # residuals + R <- X - mlm(Fy, alphas, modes) + # negative log-likelihood (without additive constant term) + 0.5 * ( + n * prod(p) * sum(log(unlist(Map(det, Deltas))) / p) + + sum(mlm(R, Map(solve, Deltas), modes) * R) + ) + }, + # Gradient of the objective with respect to the parameter matirces + # `alphas` only, only the first argument, the second are `more.params`. 
+ fun.grad = function(alphas, Deltas) { + # Residuals + R <- X - mlm(Fy, alphas, modes) + # Gradients for each alpha + Map(function(j) { + # MLM of Fy with alpha_k, k in [r] \ j + Fa <- mlm(Fy, alphas[-j], modes[-j]) + # Gradient of the loss with respect to alpha_j + (-2 / prod(dim(X))) * mcrossprod(R, Fa, modes[j]) + }, seq_along(modes)) + }, + # Initial parameters (subject to Gradient Descent) + params = alphas, + # Initial additional parameters (subject to updating given `params`) + more.params = Deltas, + # Update `Deltas` given `alphas` + fun.more.params = function(alphas, old.Deltas) { + # Residuals + R <- X - mlm(Fy, alphas, modes) + # Solve cross dependent System of Delta equations using the ICU + # algorithm (with a small number of iterations, should be enough) + ICU( + fun.loss = function(Deltas) { + # negative log-likelihood (without additive constant term) + 0.5 * ( + n * prod(p) * sum(log(unlist(Map(det, Deltas))) / p) + + sum(mlm(R, Map(solve, Deltas), modes) * R) + ) + }, + fun.update = function(Deltas, j) { + # Delta update + (p[j] / (n * prod(p))) * mcrossprod( + mlm(R, Map(solve, Deltas[-j]), modes[-j]), R, modes[j]) + }, + params = old.Deltas, + max.iter = 5L) + }, + # Linear Combination of parameters, basically: a * lhs + b * rhs for each + # combination of elements in the LHS and RHS lists with scalars a, b. + fun.lincomb = function(a, LHS, b, RHS) { + Map(function(lhs, rhs) a * lhs + b * rhs, LHS, RHS) + }, + # squared norm of parameters + fun.norm2 = function(params) sum(unlist(params, use.names = FALSE)^2), + ..., + callback = if (is.function(logger)) { + function(iter, alphas, Deltas) { + logger("mle", iter, alphas, Deltas) + } + } else { + NULL + }) + # Remap parameter names + names(params) <- c("alphas", "Deltas") + } + + params +} + +#' Higher Order Parametric Inverse Regression +#' +#' @export +HOPIR <- function(X, Fy, sample.axis, method = c("ls", "mle"), + algorithm = c("icu", "nagd"), ..., center = TRUE, logger = NULL +) { + # Predair and check input parameters + method <- match.arg(method) + algorithm <- match.arg(algorithm) + # ensure response is tensor valued + if (!is.array(Fy)) { + # scalar response case (add new axis of size 1) + dim(Fy) <- ifelse(seq_along(dim(X)) == sample.axis, dim(X)[sample.axis], 1L) + } + # Check dimensions and matching of axis (tensor order) + stopifnot(exprs = { + length(dim(X)) == length(dim(Fy)) + dim(X)[sample.axis] == dim(Fy)[sample.axis] + }) + # warn about occurence of an axis without reduction + if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) { + warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'") + } + + # Set the default logger + if (is.logical(logger) && logger) { + start <- Sys.time() + logger <- function(method, iter, ...) { + if (iter) { + cat(sprintf("%4d - %s - Elapsed: %s\n", + iter, method, format(Sys.time() - start))) + } + } + } + + # Get axis indices (observation modes) + modes <- seq_along(dim(X))[-sample.axis] + n <- dim(X)[sample.axis] # observation count (scalar) + p <- dim(X)[-sample.axis] # predictor dimensions (vector) + + # center data (predictors and responses) + if (center) { + # Means for X and Fy (a.k.a. 
sum elements over the sample axis) + meanX <- apply(X, modes, mean, simplify = TRUE) + meanFy <- apply(Fy, modes, mean, simplify = TRUE) + # Center both X and Fy + X <- sweep(X, modes, meanX) + Fy <- sweep(Fy, modes, meanFy) + } else { + meanX <- meanFy <- NA + } + + + ### Step 0: Initial parameter estimates HOPCA + alphas <- Map(function(mode, ncol) { + La.svd(mcrossprod(X, mode = mode), ncol)$u + }, modes, dim(Fy)[modes]) + + ### Step 1: LS estimate + ls <- HOPIR.ls(X, Fy, alphas, sample.axis, + algorithm, ..., logger = logger) + + ### Step 2: MLE estimate + if (method == "mle") { + mle <- HOPIR.mle(X, Fy, ls$alphas, ls$Deltas, sample.axis, + algorithm, ..., logger = logger) + + # add means and LS estimates as attribute + structure( + c(mle, list(meanX = meanX, meanFy = meanFy)), + ls = ls + ) + } else { + # add means to LS estimates + c(ls, list(meanX = meanX, meanFy = meanFy)) + } +} diff --git a/tensorPredictors/R/HOSVD.R b/tensorPredictors/R/HOSVD.R new file mode 100644 index 0000000..68392fa --- /dev/null +++ b/tensorPredictors/R/HOSVD.R @@ -0,0 +1,23 @@ +#' Higher Order Singular Value Decomposition +#' +#' @param X multi-dimensional array (at least a matrix) +#' @param nu Number of Singula Vector per mode. Defaults to a complete HO-SVD. +#' @param eps tolerance for detecting linear dependence in columns of a matrix. +#' Used for rank deduction and passed to \code{\link{qr}}. +#' +#' @export +HOSVD <- function(X, nu = NULL, eps = 1e-07) { + if (!missing(nu)) { + stopifnot(all(nu <= dim(X))) + } + + # Compute per mode singular vectors + Us <- Map(function(i) { + xx <- mcrossprod(X, , i) + La.svd(xx, if (is.null(nu)) qr(xx, tol = eps)$rank else nu[i], 0)$u + }, seq_along(dim(X))) + # Compute Core tensor + C <- mlm(X, Map(t, Us)) + + list(C = C, Us = Us) +} diff --git a/tensorPredictors/R/ICU.R b/tensorPredictors/R/ICU.R new file mode 100644 index 0000000..030923e --- /dev/null +++ b/tensorPredictors/R/ICU.R @@ -0,0 +1,56 @@ +#' Iterative Cyclic (Coordinate) Update +#' +#' @param fun.loss Scalar loss function (minimization objective), its signature +#' is \code{function(params)} and return a scalar. +#' @param fun.update compute new parameter (parameter block) for the \code{index} +#' parameter (block) in \code{params}, the function signature is +#' \code{function(params, index)} and returns a parameter block corresponding +#' to \code{fun.getElement(params, index)} +#' @param params initial paramiters, a.k.a. start position +#' @param indices parameter index set used with \code{[[<-} and passed to +#' \code{fun.update} +#' @param fun.sample computes a permutation of indices. If the parameters +#' should not be permuted use \code{identity}. +#' @param max.iter maximum number of parameter update cycles +#' @param eps small constant used in break conditions +#' @param callback function invoked for each iteration (including iteration 0) +#' with the signature \code{function(iter, params)}. 
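+#'
+#' A minimal usage sketch (hypothetical, independent of the shipped example
+#' file): a rank one least squares fit of \code{X} by \code{tcrossprod(a, b)},
+#' cycling over the two parameter blocks where each update is the exact
+#' solution given the other block.
+#' \preformatted{
+#' X <- tcrossprod(rnorm(10), rnorm(7)) + matrix(rnorm(70, sd = 0.1), 10, 7)
+#' fit <- ICU(
+#'     fun.loss = function(par) sum((X - tcrossprod(par[[1]], par[[2]]))^2),
+#'     fun.update = function(par, j) {
+#'         if (j == 1) colSums(t(X) * par[[2]]) / sum(par[[2]]^2) # a given b
+#'         else        colSums(X * par[[1]]) / sum(par[[1]]^2)    # b given a
+#'     },
+#'     params = list(rnorm(10), rnorm(7))
+#' )
+#' }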
+#' +#' @example inst/examples/ICU.R +#' +#' @export +ICU <- function(fun.loss, fun.update, params, + indices = base::seq_along(params), + fun.sample = base::sample, + max.iter = 50L, + eps = .Machine$double.eps, + callback = NULL +) { + # Compute initial loss + loss <- fun.loss(params) + + # Call callback for with initial parameters + if (is.function(callback)) callback(0L, params) + + # iteration loop of parameter update cycles + for (iter in seq_len(max.iter)) { + # Random order parameter update cycle + for (index in fun.sample(indices)) { + params[[index]] <- fun.update(params, index) + } + + # Call callback after each update cycle + if (is.function(callback)) callback(iter, params) + + # recompute loss for brack condition + loss.last <- loss + loss <- fun.loss(params) + + # and check break condition + if (abs(loss.last - loss) < eps) { + break + } + } + + params +} diff --git a/tensorPredictors/R/LSIR.R b/tensorPredictors/R/LSIR.R index 1fef43a..ccfcdc7 100644 --- a/tensorPredictors/R/LSIR.R +++ b/tensorPredictors/R/LSIR.R @@ -15,7 +15,7 @@ LSIR <- function(X, y, p, t, k = 1L, r = 1L) { # the code assumes: # alpha: T x r, beta: p x k, X_i: p x T, for ith observation - + # Check and transform parameters. if (!is.matrix(X)) X <- as.matrix(X) n <- nrow(X) diff --git a/tensorPredictors/R/NAGD.R b/tensorPredictors/R/NAGD.R new file mode 100644 index 0000000..b59d59f --- /dev/null +++ b/tensorPredictors/R/NAGD.R @@ -0,0 +1,164 @@ +#' Nesterov Accelerated Gradient Descent +#' +#' Minimized \code{fun.loss} given its gradient \code{fun.grad} from initial +#' position \code{params}. This generiv implementation allows for structured +#' parameters provided that the function \code{fun.lincomb} and \code{fun.norm2} +#' can handle the parameters appropriately. +#' +#' @param fun.loss Scalar loss function (minimization objective), its signature +#' is \code{function(params)} or \code{function(params, more.params)} if +#' \code{more.params} is not missing and its return is assumed to be a scalar. +#' @param fun.grad Gradient of \code{fun.loss} with signature +#' \code{function(params)} or \code{function(params, more.params)} if +#' \code{more.params} is not missing and its return is assumed to be \code{params}. +#' @param params initial paramiters, a.k.a. start position +#' @param more.params further parameters not subject to optimization. They might +#' change during optimization as result of a call to \code{fun.more.params}. +#' @param fun.more.params function of signature +#' \code{function(params, more.params)} if \code{more.params} is not missing. +#' This is called whenever \code{params} where updated if \code{more.params} +#' are not missing. +#' @param fun.lincomb linear combination of parameters, see examples. +#' @param fun.norm2 squared norm of parameters, applied to \code{fun.grad} output +#' @param max.iter maximum number of gradient updates +#' @param max.line.iter maximum number of line search iterations +#' @param step.size initial step size, used in the first iterate as initial +#' value in the backtracking line search. Gets addapted during runtime. +#' @param armijo constant for Armijo condition in the line search +#' @param gamma line search step size reduction in the open (0, 1) interval +#' @param eps small constant used in break conditions +#' @param callback function invoked for each iteration (including iteration 0) +#' with the signature \code{function(iter, params)} or +#' \code{function(iter, params, more.params)}. 
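+#'
+#' Each iteration extrapolates from the previous position using a Nesterov
+#' momentum term, takes a gradient step at the extrapolated position and
+#' backtracks the step size \code{delta} (shrinking it by the factor
+#' \code{gamma}) until the Armijo condition
+#' \code{loss.temp <= loss - armijo * delta * fun.norm2(gradients)} holds.
+#' If the line search fails, the next iteration retries without momentum.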
+#' +#' @return Ether the final parameter estimates \code{params} or a list with +#' parameters and more parameters \code{list(params, more.params)} in case +#' of non missing \code{more.params}. +#' +#' @example inst/examples/NAGD.R +#' +#' @export +NAGD <- function(fun.loss, fun.grad, params, more.params = NULL, + fun.more.params = function(params, more.params) more.params, + fun.lincomb = function(a, params1, b, params2) a * params1 + b * params2, + fun.norm2 = function(params) sum(params^2), + max.iter = 50L, max.line.iter = 50L, step.size = 1e-2, + armijo = 0.1, gamma = 2 / (1 + sqrt(5)), + eps = sqrt(.Machine$double.eps), + callback = NULL +) { + # momentum extrapolation weights + m <- c(0, 1) + + # Compute initial loss + if (missing(more.params)) { + loss <- fun.loss(params) + } else { + loss <- fun.loss(params, more.params) + } + if (!is.finite(loss)) { + stop("Initial loss is non-finite (", loss, ")") + } + # initialize "previous" iterate parameters + params.last <- params + + # Gradient Descent Loop + line.search.tag <- FALSE # init line search state as "failure" + for (iter in seq_len(max.iter)) { + # Call callback for previous iterate + if (missing(more.params)) { + if (is.function(callback)) callback(iter - 1L, params) + } else { + if (is.function(callback)) callback(iter - 1L, params, more.params) + } + + # Extrapolation form previous position (momentum) + # `params.moment <- (1 + moment) * params - moment * param.last` + moment <- (m[1] - 1) / m[2] + params.moment <- fun.lincomb(1 + moment, params, -moment, params.last) + + # Compute gradient at extrapolated position + if (missing(more.params)) { + gradients <- fun.grad(params.moment) + } else { + more.params <- fun.more.params(params.moment, more.params) + gradients <- fun.grad(params.moment, more.params) + } + + # gradient inner product (with itself), aka squared norm + grad.inner.prod <- fun.norm2(gradients) + if (!is.finite(grad.inner.prod)) { + stop("Encountered non-finite gradient (", grad.inner.prod, + ") with loss (", loss, ")") + } + + # Backtracking like Line Search + for (delta in step.size * gamma^seq.int(-1L, length.out = max.line.iter)) { + # Gradient Update with current step size + params.temp <- fun.lincomb(1, params.moment, -delta, gradients) + + # compute loss at temporary position + if (missing(more.params)) { + loss.temp <- fun.loss(params.temp) + } else { + more.params.temp <- fun.more.params(params.temp, more.params) + loss.temp <- fun.loss(params.temp, more.params.temp) + } + loss.temp <- if (is.finite(loss.temp)) loss.temp else Inf + + # check Armijo condition at temporary position + if (loss.temp <= loss - armijo * delta * grad.inner.prod) { + line.search.tag <- TRUE + break + } + } + + # keep track of previous parameters + params.last <- params + + # check line search outcome + if (is.na(line.search.tag)) { + # line search hopeless -> break algorithm + if (missing(more.params)) { + return(params) + } else { + return(list(params = params, more.params = more.params)) + } + } else if (line.search.tag == TRUE) { + # line search success -> check break conditions + if (abs(loss - loss.temp) < eps * loss) { + break + } + # update loss and parameters + loss <- loss.temp + params <- params.temp + if (!missing(more.params)) { + more.params <- more.params.temp + } + # momentum extrapolation weights + m <- c(m[2], (1 + sqrt(1 + (2 * m[2])^2)) / 2) + # and the step size + step.size <- delta + # set line search tag to false for next step + line.search.tag <- FALSE + } else { + # line search failure -> retry 
without momentum + line.search.tag <- NA + next + } + } + + # Call callback with final result + if (missing(more.params)) { + if (is.function(callback)) callback(iter, params) + } else { + if (is.function(callback)) callback(iter, params, more.params) + } + + # return estimated parameters + if (missing(more.params)) { + return(params) + } else { + return(list(params = params, more.params = more.params)) + } +} diff --git a/tensorPredictors/R/RMap.R b/tensorPredictors/R/RMap.R new file mode 100644 index 0000000..579ea71 --- /dev/null +++ b/tensorPredictors/R/RMap.R @@ -0,0 +1,21 @@ +#' Recursive Map +#' +#' @examples +#' RMap(paste, +#' list("a", list("b", "c", list("d", "e")), "f"), +#' list("A", list("B", "C", list("D", "E")), "F"), +#' list(10, list(20), 30), +#' 1:3, +#' "X" +#' ) +#' +#' @export +RMap <- function(f, ...) { + Map(function(...) { + if (any(unlist(Map(is.recursive, list(...))))) { + RMap(f, ...) + } else { + match.fun(f)(...) + } + }, ...) +} diff --git a/tensorPredictors/R/dist_kron_norm.R b/tensorPredictors/R/dist_kron_norm.R new file mode 100644 index 0000000..46bf337 --- /dev/null +++ b/tensorPredictors/R/dist_kron_norm.R @@ -0,0 +1,39 @@ +#' Squared Frobenius norm of difference of Kronecker product matrices +#' +#' \|(A1 %x% ... %x% Ar - B1 %x% ... %x% Br\|_F +#' +#' This is equivalent to the expression +#' \code{norm(Reduce(kronecker, A) - Reduce(kronecker, B), "F")} but faster. +#' +#' @examples +#' A1 <- matrix(rnorm(5^2), 5) +#' A2 <- matrix(rnorm(7^2), 7) +#' B1 <- matrix(rnorm(5^2), 5) +#' B2 <- matrix(rnorm(7^2), 7) +#' stopifnot(all.equal( +#' dist.kron.norm(list(A1, A2), list(B1, B2)), +#' norm(kronecker(A1, A2) - kronecker(B1, B2), "F") +#' )) +#' +#' p <- c(3, 7, 5, 2) +#' A <- Map(function(pj) matrix(rnorm(pj^2), pj), p) +#' B <- Map(function(pj) matrix(rnorm(pj^2), pj), p) +#' stopifnot(all.equal( +#' dist.kron.norm(A, B), +#' norm(Reduce(kronecker, A) - Reduce(kronecker, B), "F") +#' )) +#' +#' @export +dist.kron.norm <- function(A, B, eps = .Machine$double.eps) { + if (is.list(A) && is.list(B)) { + norm2 <- prod(unlist(Map(function(x) sum(x^2), A))) - + 2 * prod(unlist(Map(function(a, b) sum(a * b), A, B))) + + prod(unlist(Map(function(x) sum(x^2), B))) + } else if (is.matrix(A) && is.matrix(B)) { + norm2 <- sum((A - B)^2) + } else { + stop("Unexpected input") + } + + if (abs(norm2) < .Machine$double.eps) 0 else sqrt(norm2) +} diff --git a/tensorPredictors/R/dist_kron_tr.R b/tensorPredictors/R/dist_kron_tr.R new file mode 100644 index 0000000..aae5fee --- /dev/null +++ b/tensorPredictors/R/dist_kron_tr.R @@ -0,0 +1,69 @@ +#' Trace of diffence/sum of left and right Kronecker product +#' +#' tr(A1 %x% ... %x% Ar - B1 %x% ... %x% Br) +#' +#' or for `sign == +1` it computes +#' +#' tr(A1 %x% ... %x% Ar + B1 %x% ... 
%x% Br) +#' +#' @examples +#' A <- matrix(rnorm(5^2), 5) +#' B <- matrix(rnorm(5^2), 5) +#' stopifnot(all.equal( +#' dist.kron.tr(list(A), list(B)), +#' sum(diag(A - B)) +#' )) +#' stopifnot(all.equal( +#' dist.kron.tr(list(A), list(B), +1), +#' sum(diag(A + B)) +#' )) +#' +#' A1 <- matrix(rnorm(5^2), 5) +#' B1 <- matrix(rnorm(5^2), 5) +#' A2 <- matrix(rnorm(7^2), 7) +#' B2 <- matrix(rnorm(7^2), 7) +#' +#' stopifnot(all.equal( +#' dist.kron.tr(list(A1, A2), list(B1, B2), -1), +#' sum(diag(kronecker(A1, A2) - kronecker(B1, B2))) +#' )) +#' +#' stopifnot(all.equal( +#' dist.kron.tr(list(A1, A2), list(B1, B2), +1), +#' sum(diag(kronecker(A1, A2) + kronecker(B1, B2))) +#' )) +#' +#' p <- c(5, 3, 7, 2) +#' As <- Map(function(pj) matrix(rnorm(pj^2), pj), p) +#' Bs <- Map(function(pj) matrix(rnorm(pj^2), pj), p) +#' stopifnot(all.equal( +#' dist.kron.tr(As, Bs), +#' sum(diag(Reduce(kronecker, As) - Reduce(kronecker, Bs))) +#' )) +#' stopifnot(all.equal( +#' dist.kron.tr(As, Bs, +1), +#' sum(diag(Reduce(kronecker, As) + Reduce(kronecker, Bs))) +#' )) +#' +#' @export +dist.kron.tr <- function(A, B, sign = -1) { + # base case: trace of the difference (or sum for `sign == +1`) + if ((is.matrix(A) || (length(A) == 1)) + && (is.matrix(B) || (length(B) == 1))) { + if (is.list(A)) A <- A[[1]] + if (is.list(B)) B <- B[[1]] + + return(sum(diag(A)) + sign * sum(diag(B))) + } + + # recursion failguard + stopifnot(is.list(A) && is.list(B)) + + # Trace of A2 %x% A3 %x% ... %x% Ar and the same for B + trA <- unlist(Map(function(C) sum(diag(C)), A)) + trB <- unlist(Map(function(C) sum(diag(C)), B)) + + # recursive case: split of left most matrices from Kronecker product + (sum(diag(A[[1]])) + sign * sum(diag(B[[1]]))) * dist.kron.tr(A[-1], B[-1], +1) - + trA[1] * prod(trB[-1]) - sign * trB[1] * prod(trA[-1]) +} diff --git a/tensorPredictors/R/dist_subspace.R b/tensorPredictors/R/dist_subspace.R index fdcfc81..8fcc9df 100644 --- a/tensorPredictors/R/dist_subspace.R +++ b/tensorPredictors/R/dist_subspace.R @@ -8,11 +8,11 @@ #' otherwise just \eqn{P_A = A A'} since \eqn{A' A} is the identity. #' @param normalize Boolean to specify if the distance shall be normalized. #' Meaning, the maximal distance scaled to be \eqn{1} independent of dimensions. -#' +#' #' @seealso #' K. Ye and L.-H. Lim (2016) "Schubert varieties and distances between #' subspaces of different dimensions" -#' +#' #' @export dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE, tol = sqrt(.Machine$double.eps) @@ -42,7 +42,7 @@ dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE, rankSum <- ncol(A) + ncol(B) c <- 1 / sqrt(max(1, min(rankSum, 2 * nrow(A) - rankSum))) } else { - c <- sqrt(2) + c <- 1 } c * norm(PA - PB, type = "F") diff --git a/tensorPredictors/R/exprs_all_equal.R b/tensorPredictors/R/exprs_all_equal.R new file mode 100644 index 0000000..f569ab0 --- /dev/null +++ b/tensorPredictors/R/exprs_all_equal.R @@ -0,0 +1,64 @@ + +#' Test if multiple expressions are (nearly) equal +#' +#' Convenience wrapper to [base::all.equal()] which is applied to each pairing +#' of an expression to the first expresstion. +#' +#' @param exprs an unevaluated expression of the form +#' ``` +#' { +#' expr1 +#' expr2 +#' ... +#' } +#' ``` +#' @param ... passed to [base::all.equal()] +#' @param stopifnot boolean, if `TRUE` an error is thrown if [base::all.equal()] +#' does not evaluate to `TRUE` for any pairing. +#' +#' @returns `TRUE` or an error message. 
+#' +#' @examples +#' exprs.all.equal({ +#' matrix(rep(1, 6), 2, 3) +#' matrix(1, 2, 3) +#' array(rep(1, 6), dim = c(2, 3)) +#' }) +#' # basicaly identical to +#' stopifnot(exprs = { +#' all.equal(matrix(rep(1, 6), 2, 3), matrix(1, 2, 3)) +#' all.equal(matrix(rep(1, 6), 2, 3), array(rep(1, 6), dim = c(2, 3))) +#' }) +#' +#' @seealso [base::all.equal()] +#' +#' @export +exprs.all.equal <- function(exprs, ..., stopifnot = TRUE) { + envir <- parent.frame() + exprs <- substitute(exprs) + + # validate if there are at least 2 expressions to compare + if (!is.symbol(exprs[[1]]) || exprs[[1]] != quote(`{`) || length(exprs) < 3) { + stop("Only one 'exprs' or not a collection of expressions") + } + + # reference value to compare all other expressions against + ref <- eval(exprs[[2]], envir = envir) + + # compare reference against all the other expressions + for (i in seq.int(3, length(exprs), by = 1)) { + comp <- all.equal(ref, eval(exprs[[i]], envir = envir), ...) + # check `all.equal` for reference against current expression + if (!(is.logical(comp) && comp)) { + msg <- c(sprintf("Expr 1 `%s` and Expr %d `%s` are not equal:", + deparse(exprs[[2]]), i - 1, deparse(exprs[[i]])), comp) + if (stopifnot) { + stop(paste(msg, collapse = "\n")) + } else { + return(msg) + } + } + } + + TRUE +} diff --git a/tensorPredictors/R/kpir_approx.R b/tensorPredictors/R/kpir_approx.R index 691dfc3..944dda2 100644 --- a/tensorPredictors/R/kpir_approx.R +++ b/tensorPredictors/R/kpir_approx.R @@ -161,7 +161,7 @@ kpir.approx <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2) # backtracking loop - for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) { + for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) { # Update `alpha` and `beta` (note: add(+), the gradients are already # pointing into the negative slope direction of the loss cause they are # the gradients of the log-likelihood [NOT the negative log-likelihood]) diff --git a/tensorPredictors/R/kpir_ls.R b/tensorPredictors/R/kpir_ls.R index fb5bcaa..96a4270 100644 --- a/tensorPredictors/R/kpir_ls.R +++ b/tensorPredictors/R/kpir_ls.R @@ -4,25 +4,38 @@ #' #' @export kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L, - eps = .Machine$double.eps, logger = NULL + eps = sqrt(.Machine$double.eps), center = TRUE, logger = NULL ) { - # Check if X and Fy have same number of observations + ### Step 0: Setup/Initialization if (!is.array(Fy)) { # scalar response case (add new axis of size 1) - dim(Fy) <- local({ - dims <- rep(1, length(dim(X))) - dims[sample.axis] <- length(Fy) - dims - }) + dim(Fy) <- ifelse(seq_along(dim(X)) == sample.axis, dim(X)[sample.axis], 1L) + } + # Check dimensions and matching of axis (tensor order) + stopifnot(exprs = { + length(dim(X)) == length(dim(Fy)) + dim(X)[sample.axis] == dim(Fy)[sample.axis] + }) + # warn about occurence of an axis without reduction + if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) { + warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'") } - # Check dimensions - stopifnot(length(dim(X)) == length(dim(Fy))) - stopifnot(dim(X)[sample.axis] == dim(Fy)[sample.axis]) - # and model constraints - stopifnot(all(dim(Fy) <= dim(X))) # mode index sequence (exclude sample mode, a.k.a. 
observation axis) modes <- seq_along(dim(X))[-sample.axis] + n <- dim(X)[sample.axis] # observation count (scalar) + p <- dim(X)[-sample.axis] # predictor dimensions (vector) + + if (center) { + # Means for X and Fy (a.k.a. sum elements over the sample axis) + meanX <- apply(X, modes, mean, simplify = TRUE) + meanFy <- apply(Fy, modes, mean, simplify = TRUE) + # Center both X and Fy + X <- sweep(X, modes, meanX) + Fy <- sweep(Fy, modes, meanFy) + } else { + meanX <- meanFy <- NA + } ### Step 1: initial per mode estimates @@ -30,24 +43,24 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L, La.svd(mcrossprod(X, mode = mode), ncol)$u }, modes, dim(Fy)[modes]) - # Call history callback (logger) before the first iteration - if (is.function(logger)) { do.call(logger, c(0L, NA, rev(alphas))) } - - ### Step 2: iterate per mode (axis) least squares estimates + ### Step 2: iterate per mode (axis) least squares estimates for (iter in seq_len(max.iter)) { + + # Invoke logger for previous iterate + if (is.function(logger)) { + logger("ls", iter - 1L, alphas) + } + # cyclic iterate over modes for (j in seq_along(modes)) { # least squares solution for `alpha_j | alpha_i, i != j` Z <- mlm(Fy, alphas[-j], modes = modes[-j]) - alphas[[j]] <- t(solve(mcrossprod(Z, mode = modes[j]), - tcrossprod(mat(Z, modes[j]), mat(X, modes[j])))) - # TODO: alphas[[j]] <- t(solve(mcrossprod(Z, j), mcrossprod(Z, X, j))) + alphas[[j]] <- t(solve( + mcrossprod(Z, Z, modes[j]), mcrossprod(Z, X, modes[j]) + )) } - # Call logger (invoke history callback) - if (is.function(logger)) { do.call(logger, c(iter, NA, rev(alphas))) } - # TODO: add some kind of break condition } @@ -56,8 +69,14 @@ kpir.ls <- function(X, Fy, max.iter = 20L, sample.axis = 1L, R <- X - mlm(Fy, alphas, modes = modes) # Moment estimates for `Delta_i`s Deltas <- Map(mcrossprod, list(R), mode = modes) - Deltas <- Map(`*`, 1 / dim(X)[sample.axis], Deltas) + Deltas <- Map(`*`, p / (n * prod(p)), Deltas) + + # Call logger with final results (including Deltas) + if (is.function(logger)) { + logger("ls", iter, alphas, Deltas) + } list(alphas = structure(alphas, names = as.character(modes)), - Deltas = structure(Deltas, names = as.character(modes))) + Deltas = structure(Deltas, names = as.character(modes)), + meanX = meanX, meanFy = meanFy) } diff --git a/tensorPredictors/R/kpir_mle.R b/tensorPredictors/R/kpir_mle.R index 99cf0d6..cd39d46 100644 --- a/tensorPredictors/R/kpir_mle.R +++ b/tensorPredictors/R/kpir_mle.R @@ -1,8 +1,11 @@ #' Per mode (axis) MLE #' +#' @param sample.axis index of the sample mode, a.k.a. 
observation axis index +#' #' @export -kpir.mle <- function(X, Fy, max.iter = 500L, sample.axis = 1L, - logger = NULL #, eps = .Machine$double.eps +kpir.mle <- function(X, Fy, sample.axis = 1L, center = TRUE, + max.iter = 50L, max.init.iter = 10L, eps = sqrt(.Machine$double.eps), + logger = NULL ) { ### Step 0: Setup/Initialization if (!is.array(Fy)) { @@ -14,7 +17,7 @@ kpir.mle <- function(X, Fy, max.iter = 500L, sample.axis = 1L, length(dim(X)) == length(dim(Fy)) dim(X)[sample.axis] == dim(Fy)[sample.axis] }) - # warn about model constraints + # warn about occurence of an axis without reduction if (any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])) { warning("Degenerate case 'any(dim(Fy)[-sample.axis] >= dim(X)[-sample.axis])'") } @@ -23,65 +26,74 @@ kpir.mle <- function(X, Fy, max.iter = 500L, sample.axis = 1L, modes <- seq_along(dim(X))[-sample.axis] # predictor axis indices n <- dim(X)[sample.axis] # observation count (scalar) p <- dim(X)[-sample.axis] # predictor dimensions (vector) - # q <- dim(Fy)[-sample.axis] # response dimensions (vector) - # r <- length(dim(X)) - 1L # tensor order (scalar) + r <- length(dim(X)) - 1L # predictor rank (tensor order) - # Means for X and Fy (a.k.a. sum elements over the sample axis) - meanX <- apply(X, modes, mean, simplify = TRUE) - meanFy <- apply(Fy, modes, mean, simplify = TRUE) - # Center both X and Fy - X <- sweep(X, modes, meanX) - Fy <- sweep(Fy, modes, meanFy) + if (center) { + # Means for X and Fy (a.k.a. sum elements over the sample axis) + meanX <- apply(X, modes, mean, simplify = TRUE) + meanFy <- apply(Fy, modes, mean, simplify = TRUE) + # Center both X and Fy + X <- sweep(X, modes, meanX) + Fy <- sweep(Fy, modes, meanFy) + } else { + meanX <- meanFy <- NA + } - ### Step 1: Initial value estimation - alphas <- Map(function(mode, ncol) { - La.svd(mcrossprod(X, mode = mode), ncol)$u - }, modes, dim(Fy)[modes]) - # Residuals - R <- X - mlm(Fy, alphas, modes = modes) - # Covariance Moment estimates - Deltas <- Map(mcrossprod, list(R), mode = modes) - Deltas <- Map(function(Delta, j) (n * prod(p[-j]))^(-1) * Delta, - Deltas, seq_along(Deltas)) - - # Call history callback (logger) before the first iteration - if (is.function(logger)) { do.call(logger, c(0L, NA, alphas, Deltas)) } + ### Step 1: Initial values + ls.fit <- kpir.ls(X, Fy, sample.axis = sample.axis, center = FALSE, + max.iter = max.init.iter, eps = eps, logger = logger) + alphas <- ls.fit$alphas + Deltas <- ls.fit$Deltas + # compute residuals + R <- X - mlm(Fy, alphas, modes) + # Compute covariance inverses + Delta.invs <- Map(solve, Deltas) + # multiply Deltas with alphas + Delta.inv.alphas <- Map(`%*%`, Delta.invs, alphas) - ### Step 2: Alternating estimate updates + ### Step 2: Iterative Updating for (iter in seq_len(max.iter)) { - # Compute covariance inverses - Deltas.inv <- Map(solve, Deltas) - # "standardize" X - Z <- mlm(X, Deltas.inv, modes = modes) + # Invoke logger for previous iterate + if (is.function(logger)) { + logger("mle", iter - 1L, alphas, Deltas) + } - # Compute new alpha estimates - alphas <- Map(function(j) { - # MLE estimate for alpha_j | alpha_k, Delta_l for all k != j and l - FF <- mlm(Fy, alphas[-j], modes = modes[-j]) - Deltas[[j]] %*% t(solve( - t(mcrossprod(mlm(FF, Deltas.inv[-j], modes = modes[-j]), FF, mode = modes[j])), - t(mcrossprod(Z, FF, mode = modes[j])))) - }, seq_along(modes)) + # random order cyclic updating + for (j in sample(2 * r)) { + # toggle between updating alpha j <= r and Delta j > r + if (j <= r) { + # Update `alpha_j` + XxDi 
<- mlm(X, Delta.invs[-j], modes[-j]) + Fxa <- mlm(Fy, alphas[-j], modes[-j]) + FxDia <- mlm(Fy, Delta.inv.alphas[-j], modes[-j]) + alphas[[j]] <- mcrossprod(XxDi, Fxa, modes[j]) %*% + solve(mcrossprod(FxDia, Fxa, modes[j])) - # update residuals - R <- X - mlm(Fy, alphas, modes = modes) + # Recompute Residuals (with updated alpha) + R <- X - mlm(Fy, alphas, modes) + } else { + j <- j - r # map from [r + 1; 2 r] to [1; r] - # next Delta estimates - Deltas <- Map(function(j) { - # MLE estimate for Delta_j | Delta_k, alpha_l for all k != j and l - (n * prod(p[-j]))^(-1) * mcrossprod( - mlm(R, Deltas[-j], modes = modes[-j]), R, mode = modes[j]) - }, seq_along(modes)) + # Update `Delta_j` + Deltas[[j]] <- (p[j] / (n * prod(p))) * + mcrossprod(mlm(R, Delta.invs[-j], modes[-j]), R, modes[j]) + # Recompute `Delta_j^-1` + Delta.invs[[j]] <- solve(Deltas[[j]]) + # as well as `Delta_j^-1 alpha_j` + Delta.inv.alphas[[j]] <- Delta.invs[[j]] %*% alphas[[j]] + } + } - # TODO: break condition!!! + # TODO: add some kind of break condition + } - - # Call history callback - if (is.function(logger)) { do.call(logger, c(iter, NA, alphas, Deltas)) } + # Before returning, call logger for the final iteration + if (is.function(logger)) { + logger("mle", iter, alphas, Deltas) } list(alphas = structure(alphas, names = as.character(modes)), diff --git a/tensorPredictors/R/kpir_momentum.R b/tensorPredictors/R/kpir_momentum.R index 0899ac0..d4e3c33 100644 --- a/tensorPredictors/R/kpir_momentum.R +++ b/tensorPredictors/R/kpir_momentum.R @@ -137,7 +137,7 @@ kpir.momentum <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2) # backtracking loop - for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) { + for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) { # Update `alpha` and `beta` (note: add(+), the gradients are already # pointing into the negative slope direction of the loss cause they are # the gradients of the log-likelihood [NOT the negative log-likelihood]) diff --git a/tensorPredictors/R/kpir_new.R b/tensorPredictors/R/kpir_new.R index 7870357..9f56763 100644 --- a/tensorPredictors/R/kpir_new.R +++ b/tensorPredictors/R/kpir_new.R @@ -112,7 +112,7 @@ kpir.new <- function(X, Fy, shape = c(dim(X)[-1], dim(Fy[-1])), inner.prod <- sum(grad.alpha^2) + sum(grad.beta^2) # Line Search loop - for (delta in step.size * 0.618034^seq.int(0L, length = max.line.iter)) { + for (delta in step.size * 0.618034^seq.int(0L, length.out = max.line.iter)) { # Update `alpha` and `beta` (note: add(+), the gradients are already # pointing into the negative slope direction of the loss cause they are # the gradients of the log-likelihood [NOT the negative log-likelihood]) diff --git a/tensorPredictors/R/matricize.R b/tensorPredictors/R/matricize.R index cb91e4d..1be504b 100644 --- a/tensorPredictors/R/matricize.R +++ b/tensorPredictors/R/matricize.R @@ -1,23 +1,94 @@ #' Matricization #' -#' @param T multi-dimensional array of order at least \code{mode} -#' @param mode dimension along to matricize +#' @param T multi-dimensional array +#' @param modes axis indices along to matricize +#' @param dims dimension of \code{T} befor matricization +#' @param inv boolean to determin if the inverse operation should be performed #' -#' @returns matrix of dimensions \code{dim(T)[mode]} by \code{prod(dim(T))[-mode]} +#' @returns matrix of dimensions \code{dims[modes]} by \code{prod(dims)[-modes]} +#' or tensor of dimensions \code{dims} iff \code{inv} is true. 
+#' +#' @examples +#' A <- array(rnorm(2 * 3 * 5), dim = c(2, 3, 5)) +#' stopifnot(exprs = { +#' all.equal(A, mat(mat(A, 1), 1, dim(A), TRUE)) +#' all.equal(A, mat(mat(A, 2), 2, dim(A), TRUE)) +#' all.equal(A, mat(mat(A, 3), 3, dim(A), TRUE)) +#' all.equal(A, mat(mat(A, c(1, 2)), c(1, 2), dim(A), TRUE)) +#' all.equal(A, mat(mat(A, c(1, 3)), c(1, 3), dim(A), TRUE)) +#' all.equal(A, mat(mat(A, c(2, 3)), c(2, 3), dim(A), TRUE)) +#' +#' all.equal(t(mat(A, 1)), mat(A, c(2, 3))) +#' all.equal(t(mat(A, 3)), mat(A, c(1, 2))) +#' }) +#' +#' stopifnot(all.equal( +#' mat(1:12, 2, dims = c(2, 3, 2)), +#' matrix(c( +#' 1, 2, 7, 8, +#' 3, 4, 9, 10, +#' 5, 6, 11, 12 +#' ), 3, 4, byrow = TRUE) +#' )) #' #' @export -mat <- function(T, mode) { - mode <- as.integer(mode) +mat <- function(T, modes, dims = dim(T), inv = FALSE) { + modes <- as.integer(modes) - dims <- dim(T) - if (length(dims) < mode) { - stop("Mode must be a pos. int. smaller equal than the tensor order") + stopifnot(exprs = { + length(T) == prod(dims) + all(modes <= length(dims)) + }) + + perm <- c(modes, seq_along(dims)[-modes]) + if (inv) { + dim(T) <- dims[perm] + perm <- order(perm) + } else { + dim(T) <- dims } - if (mode > 1L) { - T <- aperm(T, c(mode, seq_along(dims)[-mode])) + T <- aperm(T, perm) + + if (inv) { + dim(T) <- dims + } else { + dim(T) <- c(prod(dims[modes]), prod(dims[-modes])) } - dim(T) <- c(dims[mode], prod(dims[-mode])) T } + + +# #' Inverse Matricization +# #' +# #' @param T matrix of dimensions \code{dims[mode]} by \code{prod(dims[-mode])} +# #' @param mode axis along the original matricization +# #' @param dims dimension of the original tensor +# #' +# #' @returns multi-dimensional array of dimensions \code{dims} +# #' +# #' @examples +# #' p <- c(2, 3, 5) +# #' A <- array(rnorm(prod(p)), dim = p) +# #' stopifnot(expr = { +# #' all.equal(A, mat.inv(mat(A, 1), 1, p)) +# #' all.equal(A, mat.inv(mat(A, 2), 2, p)) +# #' all.equal(A, mat.inv(mat(A, 3), 3, p)) +# #' }) +# #' +# #' @export +# mat.inv <- function(T, modes, dims) { +# modes <- as.integer(modes) + +# stopifnot(exprs = { +# length(T) == prod(dims) +# any(length(dims) < modes) +# }) + +# dim(T) <- c(dims[modes], dims[-modes]) +# T <- aperm(T, order(c(modes, seq_along(dims)[-modes]))) +# dim(T) <- c(prod(dims[modes]), prod(dims[-modes])) + +# T +# } diff --git a/tensorPredictors/R/mcov.R b/tensorPredictors/R/mcov.R new file mode 100644 index 0000000..3e37222 --- /dev/null +++ b/tensorPredictors/R/mcov.R @@ -0,0 +1,45 @@ +#' Mode wise Covariance Estimates +#' +#' Estimates Covariances \eqn{\Sigma_k}{Sigma_k} for each mode \eqn{k}. +#' This is equivalent to assuming a Kronecker structured Covariance +#' +#' \deqn{\Sigma = \Sigma_r\otimes ... \otimes\Sigma_1}{% +#' Sigma = Sigma_r %x% ... %x% Sigma_1} +#' +#' where \eqn{\Sigma}{Sigma} is the Covariance \eqn{cov(vec(X))} of the +#' vectorized variables. This function estimates the Kronecerk components +#' \eqn{\Sigma_k}{Sigma_k}. 
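# An illustrative check of the scaling convention described above (a sketch,
# not part of the patch, assuming the mcov() defined below and the mat()
# function from this patch): the traces of the per-mode estimates multiply to
# the trace of the sample covariance of the vectorized observations
# (denominator n).
n <- 100
X <- array(rnorm(n * 4 * 3), dim = c(n, 4, 3))
Sigmas <- mcov(X, sample.axis = 1L)
Z <- scale(mat(X, 1), scale = FALSE)    # n x 12 matrix of centered vec(X_i) rows
stopifnot(all.equal(
    prod(sapply(Sigmas, function(Sk) sum(diag(Sk)))),
    sum(diag(crossprod(Z) / n))
))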
+#' +#' @param X multi-dimensional array +#' @param sample.axis observation axis index +#' +#' @export +mcov <- function(X, sample.axis = 1L) { + # observation modes (axis indices) + modes <- seq_along(dim(X))[-sample.axis] + # observation dimensions + p <- dim(X)[modes] + # observation tensor order + r <- length(p) + + # ensure observations are on the last mode + if (sample.axis != r + 1L) { + X <- aperm(X, c(modes, sample.axis)) + } + # centering: Z = X - E[X] + Z <- X - c(rowMeans(X, dims = r)) + + # estimes (unscaled) covariances for each mode + Sigmas <- .mapply(mcrossprod, list(mode = seq_len(r)), MoreArgs = list(Z)) + # scale by per mode "sample" size + Sigmas <- .mapply(`*`, list(Sigmas, p / prod(dim(X))), NULL) + + # estimate trace of Kronecker product of covariances + tr.est <- prod(p) * mean(Z^2) + + # as well as the current trace of the unscaled covariances + tr.Sigmas <- prod(unlist(.mapply(function(S) sum(diag(S)), list(Sigmas), NULL))) + + # Scale each mode Covariance to match the estimated Kronecker product scale + .mapply(`*`, list(Sigmas), MoreArgs = list((tr.est / tr.Sigmas)^(1 / r))) +} diff --git a/tensorPredictors/R/mcrossprod.R b/tensorPredictors/R/mcrossprod.R index cf233a9..67282de 100644 --- a/tensorPredictors/R/mcrossprod.R +++ b/tensorPredictors/R/mcrossprod.R @@ -1,18 +1,19 @@ #' Tensor Mode Crossproduct #' -#' C = A_(m) t(A_(m)) +#' C = A_(m) t(B_(m)) #' -#' For a matrix `A`, the first mode is `mcrossprod(A, 1)` equivalent to -#' `A %*% t(A)` (`tcrossprod`). On the other hand for mode two `mcrossprod(A, 2)` -#' the equivalence is `t(A) %*% A` (`crossprod`). +#' For a matrices `A`, `B`, the first mode is `mcrossprod(A, B, 1)` equivalent +#' to `A %*% t(B)` (`tcrossprod`). On the other hand for mode two +#' `mcrossprod(A, 2)` the equivalence is `t(A) %*% B` (`crossprod`). #' #' @param A multi-dimensional array +#' @param B multi-dimensional array (allowed missing, defaults to `A`) #' @param mode index (1-indexed) #' -#' @returns matrix of dimensions \code{dim(A)[mode] by dim(A)[mode]}. +#' @returns matrix of dimensions \code{dim(A)[mode] by dim(B)[mode]}. #' -#' @note equivalent to \code{tcrossprod(mat(A, mode))} with around the same -#' performance but only allocates the result matrix. +#' @note equivalent to \code{tcrossprod(mat(A, mode), mat(B, mode))} with around +#' the same performance but only allocates the result matrix. #' #' @examples #' dimA <- c(2, 5, 7, 11) @@ -43,12 +44,18 @@ #' )) #' #' @export -mcrossprod <- function(A, B, mode = length(dim(A))) { +mcrossprod <- function(A, B, mode) { storage.mode(A) <- "double" + if (is.null(dim(A))) { + dim(A) <- length(A) + } if (missing(B)) { .Call("C_mcrossprod_sym", A, as.integer(mode)) } else { storage.mode(B) <- "double" + if (is.null(dim(B))) { + dim(B) <- length(B) + } .Call("C_mcrossprod", A, B, as.integer(mode)) } } diff --git a/tensorPredictors/R/mkm.R b/tensorPredictors/R/mkm.R new file mode 100644 index 0000000..8a2f6b4 --- /dev/null +++ b/tensorPredictors/R/mkm.R @@ -0,0 +1,31 @@ +#' Multi Kronecker Multiplication +#' +#' \deqn{C = A (B_1\otimes ...\otimes B_r)}{% +#' C = A (B_1 %x% ... 
%x% B_r)} +#' +#' @examples +#' n <- 17 +#' p <- c(2, 7, 11) +#' q <- c(3, 5, 13) +#' +#' A <- matrix(rnorm(n * prod(p)), n) +#' Bs <- Map(matrix, Map(rnorm, p * q), p) +#' +#' stopifnot(all.equal( +#' A %*% Reduce(`%x%`, Bs), +#' mkm(A, Bs) +#' )) +#' +#' @export +mkm <- function(A, Bs) { + # reshape + dim(A) <- c(nrow(A), rev(mapply(nrow, Bs))) + + # perform equiv Multi Linear Multiplication + C <- mlm(A, rev(Bs), seq_along(Bs) + 1, transposed = TRUE) + + # reshape back + dim(C) <- c(nrow(C), prod(dim(C)[-1])) + + C +} diff --git a/tensorPredictors/R/mlm.R b/tensorPredictors/R/mlm.R index c73b1b5..1f8cee4 100644 --- a/tensorPredictors/R/mlm.R +++ b/tensorPredictors/R/mlm.R @@ -1,33 +1,32 @@ #' Multi Linear Multiplication #' -#' C = A x { B1, ..., Br } +#' \deqn{C\times\{ B_1, ..., B_r \}}{% +#' C = A x { B1, ..., Br }} #' #' @param A tensor (multi-linear array) -#' @param B matrix or list of matrices -#' @param ... further matrices, concatenated with \code{B} -#' @param modes integer sequence of the same length as number of matrices -#' supplied (in \code{B} and \code{...}) +#' @param Bs matrix or list of matrices +#' @param modes integer sequence of the same length as `Bs` specifying the +#' multiplication axis (defaults to `seq_along(Bs)`) +#' @param transposed single boolean or boolean vector of same length as \code{Bs} +#' to transpose the \code{Bs} of matching index before multiplication. #' #' @examples #' # general usage #' dimA <- c(3, 17, 19, 2) #' dimC <- c(7, 11, 13, 5) #' A <- array(rnorm(prod(dimA)), dim = dimA) -#' B <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimC, dimA) -#' C1 <- mlm(A, B) -#' C2 <- mlm(A, B[[1]], B[[2]], B[[3]], B[[4]]) -#' C3 <- mlm(A, B[[3]], B[[1]], B[[2]], B[[4]], modes = c(3, 1, 2, 4)) -#' C4 <- mlm(A, B[1:3], B[[4]]) +#' Bs <- Map(function(p, q) matrix(rnorm(p * q), p, q), dimC, dimA) +#' C1 <- mlm(A, Bs) +#' C2 <- mlm(A, Bs) +#' C3 <- mlm(A, Bs[c(3, 1, 2, 4)], modes = c(3, 1, 2, 4)) #' stopifnot(all.equal(C1, C2)) #' stopifnot(all.equal(C1, C3)) -#' stopifnot(all.equal(C1, C4)) #' #' # selected modes -#' C1 <- mlm(A, B[2:3], modes = 2:3) -#' C2 <- mlm(A, B[[2]], B[[3]], modes = 2:3) -#' C3 <- ttm(ttm(A, B[[2]], 2), B[[3]], 3) -#' stopifnot(all.equal(C1, C2)) -#' stopifnot(all.equal(C1, C3)) +#' stopifnot(all.equal( +#' mlm(A, Bs[2:3], modes = 2:3), +#' ttm(ttm(A, Bs[[2]], 2), Bs[[3]], 3) +#' )) #' #' # analog to matrix multiplication #' A <- matrix(rnorm( 6), 2, 3) @@ -38,6 +37,16 @@ #' mlm(B, list(A, C)) #' )) #' +#' # usage of transposed +#' A <- matrix(rnorm( 6), 2, 3) +#' B <- matrix(rnorm(15), 3, 5) +#' C <- matrix(rnorm(35), 5, 7) +#' +#' stopifnot(all.equal( +#' A %*% B %*% C, +#' mlm(B, list(A, C), transposed = c(FALSE, TRUE)) +#' )) +#' #' # usage with repeated modes (non commutative) #' dimA <- c(3, 17, 19, 2) #' A <- array(rnorm(prod(dimA)), dim = dimA) @@ -46,17 +55,17 @@ #' C <- matrix(rnorm(4), 2, 2) #' # same modes do NOT commute #' all.equal( -#' mlm(A, B1, B2, C, modes = c(1, 1, 4)), # NOT equal! -#' mlm(A, B2, B1, C, modes = c(1, 1, 4)) +#' mlm(A, list(B1, B2, C), c(1, 1, 4)), # NOT equal! 
+#' mlm(A, list(B2, B1, C), c(1, 1, 4)) #' ) #' # but different modes do commute -#' P1 <- mlm(A, C, B1, B2, modes = c(4, 1, 1)) -#' P2 <- mlm(A, B1, C, B2, modes = c(1, 4, 1)) -#' P3 <- mlm(A, B1, B2, C, modes = c(1, 1, 4)) +#' P1 <- mlm(A, list(C, B1, B2), c(4, 1, 1)) +#' P2 <- mlm(A, list(B1, C, B2), c(1, 4, 1)) +#' P3 <- mlm(A, list(B1, B2, C), c(1, 1, 4)) #' stopifnot(all.equal(P1, P2)) #' stopifnot(all.equal(P1, P3)) #' -#' Concatination of MLM is MLM +#' # Concatination of MLM is MLM #' dimX <- c(4, 8, 6, 3) #' dimA <- c(3, 17, 19, 2) #' dimB <- c(7, 11, 13, 5) @@ -67,13 +76,20 @@ #' all.equal(mlm(mlm(X, As), Bs), mlm(X, Map(`%*%`, Bs, As))) #' #' @export -mlm <- function(A, B, ..., modes = seq_along(B)) { +mlm <- function(A, Bs, modes = seq_along(Bs), transposed = FALSE) { # Collect all matrices in `B` - B <- c(if (is.matrix(B)) list(B) else B, list(...)) + Bs <- if (is.matrix(Bs)) list(Bs) else Bs + + # replicate transposition if of length one only + transposed <- if (length(transposed) == 1) { + rep(as.logical(transposed), length(Bs)) + } else { + as.logical(transposed) + } # iteratively apply Tensor Times Matrix multiplication over modes for (i in seq_along(modes)) { - A <- ttm(A, B[[i]], modes[i]) + A <- ttm(A, Bs[[i]], modes[i], transposed[i]) } # return result tensor diff --git a/tensorPredictors/R/mtvk.R b/tensorPredictors/R/mtvk.R new file mode 100644 index 0000000..4acf546 --- /dev/null +++ b/tensorPredictors/R/mtvk.R @@ -0,0 +1,63 @@ +#' Matrix Times Vectorized Kronecker product +#' +#' \deqn{C = A vec(B_1\otimes ... \otimes B_r)}{% +#' C = A vec(B_1 %x% ... %x% B_r)} +#' +#' This function is equivalent to `c(A %*% c(Reduce("%x%", Bs)))`. +#' +#' @param A matrix of dimensions `n` by `pp * qq` +#' @param Bs list of matrices such that the product there Kronecker product has +#' dimensions `pp` by `qq`. +#' +#' @returns vector of length `n` +#' +#' @examples +#' n <- 17 +#' p <- c(2, 5, 11) +#' q <- c(3, 7, 13) +#' +#' A <- matrix(rnorm(n * prod(p * q)), n) +#' Bs <- Map(matrix, Map(rnorm, p * q), p) +#' +#' stopifnot(all.equal( +#' c(A %*% c(Reduce(`%x%`, Bs))), +#' mtvk(A, Bs) +#' )) +#' +#' @note May be slower than `c(A %*% c(Reduce("%x%", Bs)))`. +#' @TODO C++ version using Rcpp is much faster than plain C using `R`s C API! +#' +#' @export +mtvk <- function(A, Bs) { + c(A %*% c(Reduce("%x%", Bs))) + # storage.mode(A) <- "double" + # Bs <- Map(`storage.mode<-`, Bs, list("double")) + # .Call("C_mtvk", A, Bs) +} + + + +# n <- 17 +# p <- rev(c(11, 7, 20)) +# q <- rev(c(13, 5, 30)) +# r <- length(p) + +# A <- matrix(rnorm(n * prod(p * q)), n) +# Bs <- Map(matrix, Map(rnorm, p * q), p) + +# microbenchmark::microbenchmark( +# A %*% c(Reduce("%x%", Bs)), +# mtvk(A, Bs) +# ) + + +# gcc -I"/usr/share/R/include" -DNDEBUG -fpic -g -O2 -fdebug-prefix-map=/build/r-base-zYgbYq/r-base-4.2.1=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -c mtvk.c -o mtvk.o +# g++ -I"/usr/share/R/include" -DNDEBUG -fpic -g -O2 -fdebug-prefix-map=/build/r-base-zYgbYq/r-base-4.2.1=. 
-fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -c mtvk.cpp -o mtvk.o \ +# -std=gnu++14 -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/home/loki/Work/tensorPredictors/wip" + + +# g++ -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o sourceCpp_2.so mtvk.o -L/usr/lib/R/lib -lR \ +# -std=gnu++14 + +# gcc -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o tensorPredictors.so -L/usr/lib/R/lib -lR \ +# init.o mcrossprod.o mtvk.o poi.o ttm.o -lblas -lgfortran -lm -lquadmath diff --git a/tensorPredictors/R/num_deriv.R b/tensorPredictors/R/num_deriv.R new file mode 100644 index 0000000..b5a513e --- /dev/null +++ b/tensorPredictors/R/num_deriv.R @@ -0,0 +1,21 @@ +#' Numeric differentiation +#' +#' @example inst/examples/num_deriv.R +#' +#' @export +num.deriv <- function(F, X, h = 1e-6, sym = FALSE) { + if (sym) { + stopifnot(isSymmetric(X)) + p <- nrow(X) + k <- seq_along(X) - 1 + mapply(function(i, j) { + dx <- h * ((k == i * p + j) | (k == j * p + i)) + (F(X + dx) - F(X - dx)) / (2 * h) + }, .row(dim(X)) - 1, .col(dim(X)) - 1) + } else { + sapply(seq_along(X), function(i) { + dx <- h * (seq_along(X) == i) + (F(X + dx) - F(X - dx)) / (2 * h) + }) + } +} diff --git a/tensorPredictors/R/patternMatrices.R b/tensorPredictors/R/patternMatrices.R new file mode 100644 index 0000000..69ca1e1 --- /dev/null +++ b/tensorPredictors/R/patternMatrices.R @@ -0,0 +1,139 @@ +#' Duplication Matrix +#' +#' Matrix such that `vec(A) = D vech(A)` for `A` symmetric +#' @examples +#' p <- 8 +#' A <- matrix(rnorm(p^2), p, p) +#' A <- A + t(A) +#' stopifnot(all.equal(c(D(nrow(A)) %*% vech(A)), c(A))) +#' +#' @export +D <- function(p) { + # setup `vec` and `vech` element indices (zero indexed) + vec <- matrix(NA_integer_, p, p) + vec[lower.tri(vec, diag = TRUE)] <- seq_len(p * (p + 1) / 2) + vec[upper.tri(vec)] <- t(vec)[upper.tri(vec)] + vech <- matrix(vec)[lower.tri(vec, diag = TRUE)] + + # construct duplication matrix + Dp <- outer(c(vec), c(vech), `==`) + storage.mode(Dp) <- "double" + Dp +} + +#' Pseudo Inverse of the Duplication Matrix +#' +#' Matrix such that `vech(A) = D^+ vec(A)` for `A` symmetric +#' +#' @examples +#' p <- 5 +#' stopifnot(all.equal(D(p) %*% D.pinv(p), N(p))) +#' +#' @export +D.pinv <- function(p) { + Dp <- D(p) + solve(crossprod(Dp), t(Dp)) +} + +#' Generalized Commutation Permutation +#' +#' Equivalent permutation to the Commutation matrix. +#' +#' @examples +#' A <- array(rnorm(105), c(28, 7, 11)) +#' stopifnot(expr = { +#' all.equal(c(A)[K.perm(dim(A), 1)], c(A)) +#' all.equal(c(A)[K.perm(dim(A), 2)], c(K(dim(A), 2) %*% c(A))) +#' all.equal(c(A)[K.perm(dim(A), 3)], c(K(dim(A), 3) %*% c(A))) +#' }) +#' +#' @export +K.perm <- function(dim, mode) { + # special case of classic commutation matrix `K(p, q) == K(c(p, q), 2)` + if (length(dim) == 1) { + dim <- c(dim, mode) + mode <- 2 + } + + # construct permutation + c(aperm(array(seq_len(prod(dim)), dim), c(mode, seq_len(length(dim))[-mode]))) +} + +#' Generalized Commutation Matrix +#' +#' K_p,(m) vec(A) = vec(A_(m)) +#' +#' for an `p` dimensional array `A`. The special case of the commutation matrix +#' is given by `K(p, q)` for the matrix dimensions `p by q`. 
+#' +#' @examples +#' # Special case of matrices +#' stopifnot(all.equal(K(5, 7), K(c(5, 7), 2))) +#' A <- matrix(rnorm(35), 5, 7) +#' stopifnot(all.equal(c(K(5, 7) %*% c(A)), c(t(A)))) +#' +#' # Kronecker commutation identity +#' A <- matrix(rnorm(3 * 7), 3, 7) +#' B <- matrix(rnorm(5 * 2), 5, 2) +#' stopifnot(all.equal( +#' B %x% A, +#' K(nrow(B), nrow(A)) %*% (A %x% B) %*% K(ncol(A), ncol(B)) +#' )) +#' +#' # General case for tensors +#' A <- array(rnorm(105), c(28, 7, 11)) +#' stopifnot(all.equal(K(dim(A), 1), diag(prod(dim(A))))) +#' stopifnot(all.equal(c(K(dim(A), 2) %*% c(A)), c(mat(A, 2)))) +#' stopifnot(all.equal(c(K(dim(A), 3) %*% c(A)), c(mat(A, 3)))) +#' +#' # Generalized Kronecker Product Commutation Identity +#' p <- c(4, 2, 3, 4) +#' q <- c(3, 4, 2, 5) +#' A <- mapply(function(p_i, q_i) { +#' matrix(rnorm(p_i * q_i), p_i, q_i) +#' }, p, q) +#' for (mode in seq_along(A)) { +#' stopifnot(all.equal( +#' Reduce(`%x%`, A[c(rev(seq_along(A)[-mode]), mode)]), +#' K(p, mode) %*% Reduce(`%x%`, rev(A)) %*% t(K(q, mode)) +#' )) +#' } +#' +#' @export +K <- function(dim, mode) { + # special case of classic commutation matrix `K(p, q) == K(c(p, q), 2)` + if (length(dim) == 1) { + dim <- c(dim, mode) + mode <- 2 + } + + # construct permutation + perm <- aperm(array(seq_len(prod(dim)), dim), c(mode, seq_len(length(dim))[-mode])) + # commutation matrix + diag(prod(dim))[perm, ] +} + +#' Symmetrizer Matrix +#' +#' N_p vec(A) = 1/2 (vec(A) + vec(A')) +#' +#' @examples +#' p <- 7 +#' stopifnot(all.equal(N(p), D(p) %*% D.pinv(p))) +#' +#' @export +N <- function(p) { + 0.5 * (diag(p^2) + K(p, p)) +} + +#' Selection Matrix +#' +#' Selects the diagonal elements of a matrix vectorization +#' +#' @export +S <- function(p) { + index <- matrix(seq_len(p^2), p, p) + s <- outer(diag(index), c(index), `==`) + storage.mode(s) <- "integer" + s +} diff --git a/tensorPredictors/R/rtensornorm.R b/tensorPredictors/R/rtensornorm.R index 9c0de75..46b2356 100644 --- a/tensorPredictors/R/rtensornorm.R +++ b/tensorPredictors/R/rtensornorm.R @@ -4,13 +4,10 @@ #' n <- 1000 #' Sigma.1 <- 0.5^abs(outer(1:5, 1:5, "-")) #' Sigma.2 <- diag(1:4) -#' X <- rtensornorm(n, 0, Sigma.1, Sigma.2) +#' X <- rtensornorm(n, 0, list(Sigma.1, Sigma.2)) #' #' @export -rtensornorm <- function(n, mean, ..., sample.axis) { - # get covariance matrices - cov <- list(...) - +rtensornorm <- function(n, mean, cov, sample.axis = 1L) { # get sample shape (dimensions of an observation) shape <- unlist(Map(nrow, cov)) @@ -34,11 +31,11 @@ rtensornorm <- function(n, mean, ..., sample.axis) { } # add mean (using recycling, observations on last mode) - X <- X + mean + X <- X + c(mean) - # permute axis for indeing observations on sample mode (permute first axis + # permute axis for indexing observations on sample mode (permute first axis # with sample mode axis) - if (!missing(sample.axis)) { + if (sample.axis != length(dims)) { axis <- seq_len(length(dims) - 1) start <- seq_len(sample.axis - 1) end <- seq_len(length(dims) - sample.axis) + sample.axis - 1 diff --git a/tensorPredictors/R/ttm.R b/tensorPredictors/R/ttm.R index 5b3c8a4..ce7bdc8 100644 --- a/tensorPredictors/R/ttm.R +++ b/tensorPredictors/R/ttm.R @@ -2,16 +2,19 @@ #' #' @param T array of order at least \code{mode} #' @param M matrix, the right hand side of the mode product such that -#' \code{ncol(M)} equals \code{dim(T)[mode]} +#' \code{ncol(M)} equals \code{dim(T)[mode]} if \code{transposed} is false, +#' otherwise the dimension matching is \code{nrow(M)} to \code{dim(T)[mode]}. 
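# A short usage sketch for the `transposed` flag documented here (assumes the
# ttm() as modified below): passing M with `transposed = TRUE` agrees with
# passing t(M) explicitly.
A <- array(rnorm(3 * 4 * 5), dim = c(3, 4, 5))
M <- matrix(rnorm(4 * 7), 4, 7)    # nrow(M) matches dim(A)[2]
stopifnot(all.equal(ttm(A, t(M), 2), ttm(A, M, 2, transposed = TRUE)))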
#' @param mode the mode of the product in the range \code{1:length(dim(T))} +#' @param transposed boolean to multiply with the transpose of \code{M} #' #' @returns multi-dimensional array of the same order as \code{T} with -#' \code{mode} dimension equal to \code{nrow(M)} +#' \code{mode} dimension equal to \code{nrow(M)} or \code{ncol(M)} if +#' \code{transposed} is true. #' #' @export -ttm <- function(T, M, mode = length(dim(T))) { +ttm <- function(T, M, mode = length(dim(T)), transposed = FALSE) { storage.mode(T) <- storage.mode(M) <- "double" - .Call("C_ttm", T, M, as.integer(mode)) + .Call("C_ttm", T, M, as.integer(mode), as.logical(transposed)) } #' @rdname ttm diff --git a/tensorPredictors/R/vech.R b/tensorPredictors/R/vech.R new file mode 100644 index 0000000..7899944 --- /dev/null +++ b/tensorPredictors/R/vech.R @@ -0,0 +1,33 @@ +#' Half vectorization of a matrix (lower part) +#' @export +vech <- function(A) A[lower.tri(A, diag = TRUE)] + +#' @export +vech.index <- function(p) which(.row(c(p, p)) >= .col(c(p, p))) + +#' @export +vech.pinv.index <- function(p) { + index <- matrix(NA_integer_, p, p) + index[lower.tri(index, diag = TRUE)] <- seq_len(p * (p + 1L) / 2L) + index[upper.tri(index)] <- t(index)[upper.tri(index)] + + index +} + +#' Pseudo inverse of the half vectorization +#' +#' @examples +#' # only valid for symmetric matrices +#' A <- matrix(rnorm(4^2), 4) +#' A <- A + t(A) +#' all.equal(A, vech.pinv(vech(A))) +#' +#' @export +vech.pinv <- function(a) { + # determine original dimensions + p <- as.integer((sqrt(8 * length(a) + 1) - 1) / 2) + stopifnot(p * (p + 1L) == 2L * length(a)) + + # de-vectorized matrix + matrix(a[vech.pinv.index(p)], p, p) +} diff --git a/tensorPredictors/inst/examples/ICU.R b/tensorPredictors/inst/examples/ICU.R new file mode 100644 index 0000000..9790c7f --- /dev/null +++ b/tensorPredictors/inst/examples/ICU.R @@ -0,0 +1,31 @@ +# Polynomial f(x, y) = 5 x^2 - 6 x y + 5 y^2 +fun <- function(x) { + 5 * x[1]^2 - 6 * x[1] * x[2] + 5 * x[2]^2 +} +# Update, df(x, y)/dx = 0 => x = 6 y / 10 for y fixed +# The same the other way around +update <- function(x, i) { + (6 / 10) * if (i == 1) x[2] else x[1] +} +# call with initial values (x, y) = (-0.5, -1) +stopifnot(all.equal( + ICU(fun, update, c(-0.5, -1)), + c(0, 0) # known minimum +)) + +# Same problem as above but with a list of parameters +fun <- function(params) { + 5 * params$x^2 - 6 * params$x * params$y + 5 * params$y^2 +} +# Update, df(x, y)/dx = 0 => x = 6 y / 10 for y fixed +# The same the other way around +update <- function(params, i) { + (6 / 10) * if (i == 1) params$y else params$x +} +# and a callback +callback <- function(iter, params) { + cat(sprintf("%3d - fun(%7.4f, %7.4f) = %6.4f\n", + iter, params$x, params$y, fun(params))) +} +# call with initial values (x, y) = (-0.5, -1) +ICU(fun, update, list(x = -0.5, y = -1), callback = callback) diff --git a/tensorPredictors/inst/examples/NAGD.R b/tensorPredictors/inst/examples/NAGD.R new file mode 100644 index 0000000..9a52f5f --- /dev/null +++ b/tensorPredictors/inst/examples/NAGD.R @@ -0,0 +1,70 @@ +# Rosenbrock function for x in R^2 +fun <- function(x, a = 1, b = 100) { + (a - x[1])^2 + b * (x[2] - x[1]^2)^2 +} +# Gradient of the Rosenbrock function +grad <- function(x, a = 1, b = 100) { + 2 * c(x[1] - a - 2 * b * x[1] * (x[2] - x[1]^2), b * (x[2] - x[1]^2)) +} +# call with initial values (x, y) = (-1, 1) +stopifnot(all.equal( + NAGD(fun, grad, c(-1, 1), max.iter = 500L), + c(1, 1) # known minimum +)) + +# Equivalent to above, but the parameters
are in a list +fun <- function(params, a = 1, b = 100) { + (a - params$x)^2 + b * (params$y - params$x^2)^2 +} +grad <- function(params, a = 1, b = 100) list( + x = 2 * (params$x - a - 2 * b * params$x * (params$y - params$x^2)), + y = 2 * b * (params$y - params$x^2) +) +# need to tell NAGD how to combine parameters +lincomb <- function(a, LHS, b, RHS) list( + x = a * LHS$x + b * RHS$x, + y = a * LHS$y + b * RHS$y +) +# and how to compute their norm (squared) +norm2 <- function(params) { + sum(unlist(params)^2) +} +# callback invoked for each update +callback <- function(iter, params) { + cat(sprintf("%3d - fun(%7.4f, %7.4f) = %6.4f\n", + iter, params$x, params$y, fun(params))) +} +# call with initial values (x, y) = (-1, 1) +fit <- NAGD(fun, grad, list(x = -1, y = 1), + fun.lincomb = lincomb, fun.norm2 = norm2, + callback = callback) + +# Weighted Least Squares for Heteroscedastic Data +# Predictors +x <- rnorm(500) +# "True" parameters +beta <- c(intercept = 1, slope = 0.5) +# Model matrix +X <- cbind(1, x) +# response + heteroscedastic noise +y <- X %*% beta + sqrt(x - min(x) + 0.1) * rnorm(length(x)) + +loss <- function(beta, w) { + sum((y - X %*% beta)^2 * w) +} +weights <- function(beta, w, delta = 1e-3) { + 1 / pmax(abs(y - X %*% beta), delta) +} +grad <- function(beta, w) { + -2 * crossprod(X, (y - X %*% beta) * w) +} + +fit <- NAGD(loss, grad, coef(lm(y ~ x)), more.params = 1, fun.more.params = weights) + +# # plot the data +# plot(x, y) +# abline(beta[1], beta[2], col = "black", lty = 2, lwd = 2) +# beta.hat.lm <- coef(lm(y ~ x)) +# abline(beta.hat.lm[1], beta.hat.lm[2], col = "red", lwd = 2) +# beta.hat.wls <- fit$params +# abline(beta.hat.wls[1], beta.hat.wls[2], col = "blue", lwd = 2) diff --git a/tensorPredictors/inst/examples/num_deriv.R b/tensorPredictors/inst/examples/num_deriv.R new file mode 100644 index 0000000..4ca2006 --- /dev/null +++ b/tensorPredictors/inst/examples/num_deriv.R @@ -0,0 +1,35 @@ +# Derivative of matrix product +A <- matrix(rnorm(3 * 3), 3, 3) +A <- A + t(A) +B <- matrix(rnorm(3 * 4), 3, 4) + +# F(A) = A B +# DF(A) = B' x I +stopifnot(all.equal( + num.deriv(function(X) X %*% B, A), + t(B) %x% diag(nrow(A)) +)) + +# Symmetric case, constraint A = A' (equiv to being a function of vech(A) only) +# F(A) = A B for A = A' +# DF(A) = B' x I +stopifnot(all.equal( + num.deriv(function(X) X %*% B, A, sym = TRUE), + (t(B) %x% diag(nrow(A))) %*% D(nrow(A)) %*% t(D(nrow(A))) +)) + +# Derivative of Kronecker Product +A <- matrix(rnorm(3 * 7), 3) +B <- matrix(rnorm(5 * 4), 5) + +P <- diag(ncol(A)) %x% K(ncol(B), nrow(A)) %x% diag(nrow(B)) +# DF(A) numeric approximation and exact solution +stopifnot(all.equal( + num.deriv(function(X) X %x% B, A), + P %*% (diag(prod(dim(A))) %x% c(B)) +)) +# DF(B) numeric approximation and exact solution +stopifnot(all.equal( + num.deriv(function(X) A %x% X, B), + P %*% (c(A) %x% diag(prod(dim(B)))) +)) diff --git a/tensorPredictors/src/init.c b/tensorPredictors/src/init.c index eb12754..54fa54e 100644 --- a/tensorPredictors/src/init.c +++ b/tensorPredictors/src/init.c @@ -7,7 +7,10 @@ // ); /* Tensor Times Matrix a.k.a. Mode Product */ -extern SEXP ttm(SEXP A, SEXP X, SEXP mode); +extern SEXP ttm(SEXP A, SEXP X, SEXP mode, SEXP op); + +/* Matrix Times Vectorized Kronecker product `A vec(B_1 %x% ...
%x% B_r)` */ +extern SEXP mtvk(SEXP A, SEXP Bs); /* Tensor Mode Crossproduct `A_(m) B_(m)^T` */ extern SEXP mcrossprod(SEXP A, SEXP B, SEXP mode); @@ -17,8 +20,9 @@ extern SEXP mcrossprod_sym(SEXP A, SEXP mode); /* List of registered routines (a.k.a. C entry points) */ static const R_CallMethodDef CallEntries[] = { // {"FastPOI_C_sub", (DL_FUNC) &FastPOI_C_sub, 5}, // NOT USED - {"C_ttm", (DL_FUNC) &ttm, 3}, - {"C_mcrossprod", (DL_FUNC) &mcrossprod, 3}, + {"C_ttm", (DL_FUNC) &ttm, 4}, + {"C_mtvk", (DL_FUNC) &mtvk, 2}, + {"C_mcrossprod", (DL_FUNC) &mcrossprod, 3}, {"C_mcrossprod_sym", (DL_FUNC) &mcrossprod_sym, 2}, {NULL, NULL, 0} }; diff --git a/tensorPredictors/src/mtvk.c b/tensorPredictors/src/mtvk.c new file mode 100644 index 0000000..67581c4 --- /dev/null +++ b/tensorPredictors/src/mtvk.c @@ -0,0 +1,101 @@ +// Suppress stripping R API prefixes, all API functions have the form `Rf_` +// and public variables are called `R_`. +#define R_NO_REMAP +#include +#include + +/** + * Matrix Times a Vectorized Kronecker product + * + * C = A vec(B_1 %x% ... %x% B_r) + * + * Note the reverse order of the `B_k` in the Kronecker product. + * + * @param A matrix of dimensions `n` by `pp * qq` + * @param Bs list of matrices such that the product there Kronecker product has + * dimensions `pp` by `qq`. + */ +extern SEXP mtvk(SEXP A, SEXP Bs) { + + if ((TYPEOF(A) != REALSXP) || !Rf_isMatrix(A)) { + Rf_error("First argument must be a numeric matrices"); + } + + // extract dimensions from `A` and `Bs` + size_t nrow = Rf_nrows(A); + size_t ncol = Rf_ncols(A); + size_t r = Rf_length(Bs); + size_t pp = 1; // `nrow(Reduce("%x%", Bs))` see below + size_t qq = 1; // `ncol(Reduce("%x%", Bs))` see below + + // retrieve dimensions and direct memory access for each component in + // reverse order + // (Allocated memory freed at function exit and errors) + double** bs = (double**)R_alloc(r, sizeof(double*)); + // dimensions of reshaped `A` columns, we treat the matrix `A` as an + // `nrow` by `p[1]` by ... by `p[2 r]` tensor where the `p[1:r]` dimensions + // as the row and `p[(r + 1):(2 r)]` the column dimensions of the Kronecker + // product components. + size_t* p = (size_t*)R_alloc(2 * r + 1, sizeof(size_t)); + // Multi-Index into the reshaped `A`: + // `A[i, j] == do.call(array(A, dim = c(nrow, p), c(i, J)))` + size_t* J = (size_t*)R_alloc(2 * r + 1, sizeof(size_t)); + + // reversed order of Kronecker components + for (size_t k = 0; k < r; ++k) { + SEXP Bk = VECTOR_ELT(Bs, k); + if ((TYPEOF(Bk) == REALSXP) && Rf_isMatrix(Bk)) { + bs[r - 1 - k] = REAL(Bk); + pp *= (p[ r - 1 - k] = Rf_nrows(Bk)); + qq *= (p[2 * r - 1 - k] = Rf_ncols(Bk)); + // set all zero + J[k] = J[k + r] = 0; + } else { + Rf_error("Second argument must be a list of numeric matrices"); + } + } + + // Additional last element such that the condition `p[2 * r] <= (++J[2 * r])` + // (evaluated exactly once) is false. Allows to skip the check `k < 2 * r` + // in the loop for updating the Multi-Index `J`. 
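+    // In other words, p[2 * r] acts as a sentinel: the carry loop that advances the multi-index `J` can never run past position 2 * r, so no explicit bounds check on `k` is needed.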
+ p[2 * r] = 127; // biggest value of smallest signed type + J[2 * r] = 0; + + // check if dimensions match + if ((ncol != pp * qq) || (r < 1)) { + Rf_error("Dimension Missmatch"); + } + + // Create new R result vector + SEXP C = PROTECT(Rf_allocVector(REALSXP, nrow)); + double* c = REAL(C); + memset(c, 0, nrow * sizeof(double)); + + // main operation (single iteration over `A`) + double* a = REAL(A); + for (size_t j = 0; j < ncol; ++j) { + // Compute `prod_k (B_k)_{J_k, J_k+r}` (identical for each row) + double prod_BJ = 1.0; + for (size_t k = 0; k < r; ++k) { + prod_BJ *= bs[k][J[k] + p[k] * J[k + r]]; + } + + // Add `j`th summand `A_i,J prod_k (B_k)_{J_k, J_k+r}` to each component + for (size_t i = 0; i < nrow; ++i) { + c[i] += a[i + j * nrow] * prod_BJ; + } + + // Compute matching Multi-Index `J` for reshaped `A` columns + // In `R` the relation between `j` and `J` is + // `A[i, j] == do.call(array(A, dim = c(nrow, p), c(i, J)))` + // Note: the check `k < 2 * r` is skipped as described above. + for (size_t k = 0; p[k] <= (++J[k]); ++k) { + J[k] = 0; + } + } + + // Onle the result object `C` needs to be unprotected + UNPROTECT(1); + + return C; +} diff --git a/tensorPredictors/src/ttm.c b/tensorPredictors/src/ttm.c index 51a24e1..f913764 100644 --- a/tensorPredictors/src/ttm.c +++ b/tensorPredictors/src/ttm.c @@ -1,6 +1,8 @@ // The need for `USE_FC_LEN_T` and `FCONE` is due to a Fortran character string // to C incompatibility. See: Writing R Extentions: 6.6.1 Fortran character strings #define USE_FC_LEN_T +// Disables remapping of R API functions from `Rf_` or `R_` +#define R_NO_REMAP #include #include #include @@ -14,31 +16,39 @@ * @param A multi-dimensional array * @param B matrix * @param m mode index (1-indexed) + * @param op boolean if `B` is transposed */ -extern SEXP ttm(SEXP A, SEXP B, SEXP m) { +extern SEXP ttm(SEXP A, SEXP B, SEXP m, SEXP op) { // get zero indexed mode - int mode = asInteger(m) - 1; + const int mode = Rf_asInteger(m) - 1; // get dimension attribute of A - SEXP dim = getAttrib(A, R_DimSymbol); + SEXP dim = Rf_getAttrib(A, R_DimSymbol); + + // operation on `B` (transposed or not) + const int trans = Rf_asLogical(op); + + // as well as `B`s dimensions + const int nrow = Rf_nrows(B); + const int ncol = Rf_ncols(B); // validate mode (mode must be smaller than the nr of dimensions) - if (mode < 0 || length(dim) <= mode) { - error("Illegal mode"); + if (mode < 0 || Rf_length(dim) <= mode) { + Rf_error("Illegal mode"); } // and check if B is a matrix of non degenetate size - if (!isMatrix(B)) { - error("Expected a matrix as second argument"); + if (!Rf_isMatrix(B)) { + Rf_error("Expected a matrix as second argument"); } - if (!nrows(B) || !ncols(B)) { - error("Zero dimension detected"); + if (!Rf_nrows(B) || !Rf_ncols(B)) { + Rf_error("Zero dimension detected"); } // check matching of dimensions - if (INTEGER(dim)[mode] != ncols(B)) { - error("Dimension missmatch (mode dim not equal to ncol)"); + if (INTEGER(dim)[mode] != (trans ? 
nrow : ncol)) { + Rf_error("Dimension missmatch"); } // calc nr of response elements `prod(dim(A)[-mode]) * ncol(A)` (size of C), @@ -48,22 +58,19 @@ extern SEXP ttm(SEXP A, SEXP B, SEXP m) { // `stride[1] <- dim(A)[mode]` // `stride[2] <- prod(dim(A)[-seq_len(mode)])` int stride[3] = {1, INTEGER(dim)[mode], 1}; - for (int i = 0; i < length(dim); ++i) { + for (int i = 0; i < Rf_length(dim); ++i) { int size = INTEGER(dim)[i]; // check for non-degenetate dimensions if (!size) { - error("Zero dimension detected"); + Rf_error("Zero dimension detected"); } - sizeC *= (i == mode) ? nrows(B) : size; + sizeC *= (i == mode) ? (trans ? ncol : nrow) : size; stride[0] *= (i < mode) ? size : 1; stride[2] *= (i > mode) ? size : 1; } - // as well as the matrix rows (cols only needed once) - int nrow = nrows(B); - // create response object C - SEXP C = PROTECT(allocVector(REALSXP, sizeC)); + SEXP C = PROTECT(Rf_allocVector(REALSXP, sizeC)); // raw data access pointers double* a = REAL(A); @@ -74,20 +81,27 @@ extern SEXP ttm(SEXP A, SEXP B, SEXP m) { const double zero = 0.0; const double one = 1.0; if (mode == 0) { - // mode 1: (A x_1 B)_(1) = B A_(1) as a single Matrix-Matrix prod - F77_CALL(dgemm)("N", "N", &nrow, &stride[2], &stride[1], &one, + // mode 1: (A x_1 op(B))_(1) = op(B) A_(1) as a single Matrix-Matrix + // multiplication + F77_CALL(dgemm)(trans ? "T" : "N", "N", + (trans ? &ncol : &nrow), &stride[2], &stride[1], &one, b, &nrow, a, &stride[1], - &zero, c, &nrow FCONE FCONE); + &zero, c, (trans ? &ncol : &nrow) + FCONE FCONE); } else { // Other modes can be written as blocks of matrix multiplications + // (A x_m op(B))_(m)' = A_(m)' op(B)' for (int i2 = 0; i2 < stride[2]; ++i2) { - F77_CALL(dgemm)("N", "T", &stride[0], &nrow, &stride[1], &one, + F77_CALL(dgemm)("N", trans ? "N" : "T", + &stride[0], (trans ? &ncol : &nrow), &stride[1], &one, &a[i2 * stride[0] * stride[1]], &stride[0], b, &nrow, - &zero, &c[i2 * stride[0] * nrow], &stride[0] FCONE FCONE); + &zero, &c[i2 * stride[0] * (trans ? ncol : nrow)], &stride[0] + FCONE FCONE); } } /* - // Tensor Times Matrix / Mode Product (reference implementation) + // (reference implementation) + // Tensor Times Matrix / Mode Product for `op(B) == B` memset(c, 0, sizeC * sizeof(double)); for (int i2 = 0; i2 < stride[2]; ++i2) { for (int i1 = 0; i1 < stride[1]; ++i1) { // stride[1] == ncols(B) @@ -102,11 +116,11 @@ extern SEXP ttm(SEXP A, SEXP B, SEXP m) { */ // finally, set result dimensions - SEXP newdim = PROTECT(allocVector(INTSXP, length(dim))); - for (int i = 0; i < length(dim); ++i) { - INTEGER(newdim)[i] = (i == mode) ? nrows(B) : INTEGER(dim)[i]; + SEXP newdim = PROTECT(Rf_allocVector(INTSXP, Rf_length(dim))); + for (int i = 0; i < Rf_length(dim); ++i) { + INTEGER(newdim)[i] = (i == mode) ? (trans ? ncol : nrow) : INTEGER(dim)[i]; } - setAttrib(C, R_DimSymbol, newdim); + Rf_setAttrib(C, R_DimSymbol, newdim); // release C to the hands of the garbage collector UNPROTECT(2);