diff --git a/default_input_values.h b/default_input_values.h
index 3437dc92f..bec54816b 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -204,7 +204,7 @@
 #define _default_subprocess_flag 0
 #define _default_lowmem_flag 0
 
-#define _default_g_barrier_monomials_convergence 0
+#define _default_g_barrier_monomials_convergence 1
 
 /* default input values for QUDA interface */
 /* These follow the recommendations of https://github.com/lattice/quda/wiki/Multigrid-Solver */
diff --git a/doc/bibliography.bib b/doc/bibliography.bib
index 1e4fcf20c..616d45e69 100644
--- a/doc/bibliography.bib
+++ b/doc/bibliography.bib
@@ -7904,3 +7904,17 @@ @inbook{Joo2013
 	year = "2013"
 }
 
+@article{Kostrzewa:2022hsv,
+    author = "Kostrzewa, Bartosz and Bacchio, Simone and Finkenrath, Jacob and Garofalo, Marco and Pittler, Ferenc and Romiti, Simone and Urbach, Carsten",
+    collaboration = "ETM",
+    title = "{Twisted mass ensemble generation on GPU machines}",
+    eprint = "2212.06635",
+    archivePrefix = "arXiv",
+    primaryClass = "hep-lat",
+    doi = "10.22323/1.430.0340",
+    journal = "PoS",
+    volume = "LATTICE2022",
+    pages = "340",
+    year = "2023"
+}
+
diff --git a/doc/quda.tex b/doc/quda.tex
index 85fa14095..81b7b6b83 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -1,24 +1,24 @@
 %author: Mario Schroeck <mario.schroeck@roma3.infn.it>
 %author: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
 %date: 04/2015
-%date: 06/2017, 12/2017, 06/2018
+%date: 06/2017, 12/2017, 06/2018, 08/2019, 05/2022, 05/2023
 
 \subsection{QUDA: A library for QCD on GPUs}\label{subsec:quda}
 
 
-The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface provides access to QUDA's solvers both in the HMC and as a solver interface for propagator production.
+The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface provides access to QUDA's solvers both in the HMC~\cite{Kostrzewa:2022hsv} and as a solver interface for propagator production.
 
 \subsubsection{Design goals of the interface}
 The QUDA interface has been designed with the following goals in mind, sorted by priority:
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
 	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
-	\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseQudaInverter} is introduced which, when set to {\ttfamily yes}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.}
-	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseQudaInverter = yes}. There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface.
+\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. In the HMC, the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials.
+	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
-		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-0.7.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
-		\item The gaugefield is transferred each time to the GPU before the inversion starts in order to ensure not to miss any modifications of the gaugefield.
+		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-1.1.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
+    \item The state of the host and device gauge fields is kept synchronised automatically and the gauge field is only transferred if it changes. This tracking of the state also applies to the clover field and the MG preconditioner setup.
 	\end{itemize}
 \end{enumerate}
 
@@ -83,21 +83,66 @@ \subsubsection{Usage}
 \end{verbatim}
 and the operator of interest will be inverted using QUDA. The initialization of QUDA is done automatically within the operator initialization,  the QUDA library should be finalized by a call to {\ttfamily \_endQuda()} just before finalizing MPI. When you use the QUDA interface for work that is being published, don't forget to cite \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa}.
 
+At the operator level as well as for fermionic monomials in the HMC, the following additional parameters affect QUDA solvers:
+\begin{itemize}
+  \item \texttt{Solver}: Type of solver to employ. Possible values for QUDA:
+    \begin{itemize}
+      \item \texttt{CG} (conjugate gradient)
+      \item \texttt{CGMMS} (multishift conjugate gradient)
+      \item \texttt{CGMMSND} (multishift conjugate gradient for the non-degenerate operator)
+      \item \texttt{BICGSTAB} (bi-conjugate gradient, useful for non-twisted Wilson and Wilson-clover fermions)
+      \item \texttt{MG} (multigrid-preconditioned GCR)
+    \end{itemize}
+    Unlike the tmLQCD-native solvers, QUDA solvers are always run in mixed-precision and there is no differentiation between the names of mixed- and fixed-precision solvers, the inner solver precision is instead set by \texttt{UseSloppyPrecision}.
+    (default: \texttt{CG})
+  \item \texttt{UseCompression}: Employ gauge compression. Possible values: 
+    \begin{itemize}
+      \item $8$ (use 8-real gauge compression)
+      \item $12$ (use two-row gauge compression)
+      \item $18$ (disable gauge compression)
+    \end{itemize}
+    Since tmLQCD always uses twisted fermionic boundary conditions, the default $18$ is recommended unless you know what you are doing. Note that if the global fermionic boundary condition parameters \texttt{Theta[X,Y,Z,T]} are set non-zero, this setting is overriden by default. See \texttt{FermionBC} to force a particular type of fermionic boundary conditions in QUDA. Note that the tmLQCD residual check will fail if compression is enabled and non-twisted boundary conditions are forced.
+  \item \texttt{UseSloppyPrecision}: Employ this precision for the inner solver of a mixed-precision solver (outer solvers are always run in double precision). Possible values:
+    \begin{itemize}
+      \item \texttt{no} / \texttt{double} (inner solver running in double precision)
+      \item \texttt{single} / \texttt{float} (inner solver running in single precision)
+      \item \texttt{half} (inner solver running in half precision)
+    \end{itemize}
+    (string, default: \texttt{double})
+  \item \texttt{RefinementPrecision}: When the operator or monomial uses the multishift (\texttt{cgmms[nd]}) solver and offloads to QUDA, this parameter sets the inner solver precision of shift-by-shift refinement solves. In practice, one might set \texttt{UseSloppyPrecision = single} and \texttt{RefinementPrecision = half}. This will iterate the residuals in the multishift solver up to single precision and then refine each solution using a double-half mixed-precision CG.
+\end{itemize}
+
+In additition, for the gauge monomial, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the gauge force to QUDA.
+
+Finally, for the \texttt{GRADIENTFLOW} online measurement, the parameter \texttt{UseExternalLibrary = quda} will offload the gradient flow to QUDA.
+
 \subsubsection{General settings}
 Some properties of the QUDA interface can be configured via the {\ttfamily ExternalInverter} section.
-\begin{verbatim}
-BeginExternalInverter QUDA
-  FermionBC = [theta, pbc, apbc]
-EndExternalInverter
-\end{verbatim}
+\begin{itemize}
+  \item \texttt{FermionBC} Forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions irrespective of what has been set for \texttt{ThetaT}. This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators. Use with care as the residual check using tmLQCD operators will suggest a non-converged residual.
+  \item \texttt{Pipeline} The pipeline length for fused operations in some solvers (for the tmLQCD QUDA interface, at the time of writing in May 2023, this is just GCR). (positive integer, default: $0$)
+  \item \texttt{gcrNkrylov} 
+  \item \texttt{ReliableDelta}
+\end{itemize}
 
-The option {\ttfamily FermionBC} shown above forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions.
-This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators.
+\subsubsection{More advanced settings}
+To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}.
+The MG-preconditioned GCR solver only works in double-single mixed precision, but the null vectors are stored in half precision as recommended by Kate Clark.
+
+To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. 
+The default is 18 which corresponds to no compression. 
+Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. 
+As a consequence, the residual check on tmLQCD side will fail. 
+Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. 
+If trying to do so, compression will be de-activated automatically and the user gets informed via the standard output.
+The \texttt{FermionBC} setting can be used to force particular temporal boundary conditions to be applied to the gauge field in the Dirac operator.
+
+\subsubsection{Functionality}
+The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG, BICGSTAB or MG-preconditioned GCR. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
 
 \subsubsection{QUDA-MG interface}
 The interface has support for the QUDA Multigrid (MG) solver implementation and allows a number of parameters to be adjusted in order to tune the MG setup.
 The defaults for these parameters follow the recommendations of \url{https://github.com/lattice/quda/wiki/Multigrid-Solver}, which also provides useful hints for further tuning.
-Although some of the parameters can be set on a per-level basis, the interface currently only exposes a single setting for all levels, where appropriate.
 The K-cycle is used by default and there is currently no user-exposed option for changing this.
 
 The MG-preconditioned GCR solver is selected as follows:
@@ -114,50 +159,60 @@ \subsubsection{QUDA-MG interface}
 EndOperator
 \end{verbatim}
 
-The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
+\paragraph{Basic MG parameters:} The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
 \begin{itemize}
-  \item{ \texttt{MGNumberOfLevels}: number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default $3$) }
-  \item{ \texttt{MGSetupSolver}: solver used for generating null vectors. \texttt{CG} or \texttt{BiCGstab} (default \texttt{CG}). Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks. }
-  \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase. (positive float, default $1\cdot10^{-6}$) }
-  \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase. (positive integer, default $1000$) }
-  \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids. (positive float, default $0.25$) }
-  \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (possible values $\left[ 24, 32 \right]$, default $24$)}
-  \item{ \texttt{MGCoarseMaxSolveriterations}: maximum number of iterations on coarse grids. (positive integer, default $75$) }
-  \item{ \texttt{MGEnableSizeThreeBlocks}: By default, QUDA has limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This requires you to instantiate the necessary block sizes in QUDA (see comments below). (boolean \emph{yes} or \emph{no}, default \emph{no}) }
-  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set for a given lattice dimension, the automatic blocking algorithm for that dimension is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of integers, for a three level solver, for example, this needs to be specified for the first and second level)} 
-  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on all levels. (positive float, default $0.25$) }
-  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction. (zero or positive integer, default $0$)}
-  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation. (zero or positive integer, default $4$)}
-  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor. (positive float, default $0.85$)}
-  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. (positive float, usually $ > 1.0$, default $8.0$ from the second level upwards).}
-  \item{ \texttt{MGRunVerify}: Check GPU coarse operators against CPU coarse operators and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (\emph{yes} or \emph{no}, default \emph{yes}) } 
+  \item{ \texttt{MGNumberOfLevels}: Number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default: $3$) }
+  \item{ \texttt{MGSetupSolver}: Solver used for generating null vectors. Possible values:
+    \begin{itemize}
+      \item \texttt{CG} (conjugate gradient)
+      \item \texttt{CGNE} (conjugate gradient normal equations)
+      \item \texttt{CGNR} (conjugate gradient normal residuals)
+      \item \texttt{CACG} (communication-avoiding conjugate gradient)
+      \item \texttt{CACGNE} (communication-avoiding conjugate gradient normal equations)
+      \item \texttt{CACGNR} (communication-avoiding conjugate gradient normal residuals)
+      \item \texttt{BiCGstab} (stabilised bi-conjugate gradient)
+    \end{itemize}
+    Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks while \texttt{CG} should always be used for twised mass or twisted clover quarks. \texttt{CACG} may be beneficial at the strong-scaling limit, see also \texttt{MGSetupCABasisType} and \texttt{MGSetupCABasisSize}. At the time of writing (May 2023), this cannot be set on a per-level basis so the effect of the communication-avoiding setup solver on the setup time may be limited or even detrimental.
+        (default: \texttt{CG})}
+  \item{ \texttt{MGVerbosity}: Verbosity of the solver on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{silent} (suppress any additional output)
+      \item \texttt{summarize} (summary information such as number of iterations and achieved residual in the respective smoother or solver)
+      \item \texttt{verbose} (verbose information such as per-iteration residual printing of the respective solver or smoother)
+      \item \texttt{debug\_verbose} (extremely verbose information at each iteration of the respective smoother or solver)
+    \end{itemize}
+    (comma-separated list of strings, default: \texttt{silent} on all levels)}
+  \item{ \texttt{MGSetupSolverTolerance}: Relative target residual (unsquared!) during setup phase on a per-level basis. (comma-separated list of positive floats, default: $1\cdot10^{-6}$ on all levels) }
+  \item{ \texttt{MGSetupMaxSolverIterations}: Maximum number of iterations during setup phase on a per-level basis. (comma-separated list of positive integers, default: $1000$ on all levels) }
+  \item{ \texttt{MGCoarseSolverTolerance}: Unsquared relative target residual on the coarse grids on a per-level basis. (comma-separated list of positive floats, default: $0.25$ on all levels) }
+  \item{ \texttt{MGNumberOfVectors}: Number of null vectors to compute on a per-level basis. (comma-separated list of integers, possible values $\left[ 24, 32 \right]$, default: $24$ on all levels)}
+  \item{ \texttt{MGCoarseMaxSolveriterations}: Maximum number of iterations on coarse grids on a per-level basis. (comma-separated list of positive integers, default: $75$ on all levels) }
+  \item{ \texttt{MGEnableSizeThreeBlocks}: Historically and by default, QUDA has had limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This may require you to instantiate the necessary block sizes in QUDA (see comments below). In recent (2022 and later) commits of the QUDA \texttt{develop} branch, this can usually be set to \emph{yes} without having to modify QUDA. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: Aggregate sizes on each level. When these are set non-zero for a given lattice dimension and a given level, the automatic blocking algorithm for that dimension and level is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level, default: \texttt{0, 0})}
 \end{itemize}
 
-If no blocking is specified manually, the aggregation parameters are set automatically as follows:
+\paragraph*{Automatic blocking:} If no blocking is specified manually, the aggregation parameters are set automatically as follows:
 \begin{itemize}
-  \item{ A default block size of $4$ is attempted if the MPI-partitioned fine or aggregate lattice extent is larger or equal to $16$ lattice sites. }
-  \item{ If the number of aggregate lattice sites in a given direction is even and smaller than $16$, a block size of $2$ is used. }
-  \item{ The option \texttt{MGEnableSizeThreeBlocks} can be set to \texttt{yes}. Then, for levels coarser than the fine grid, extents smaller than $16$ and divisible by $3$, a block size of $3$ will be used. This will almost certainly require the addition of instantiations of block sizes to QUDA in the restrictor and transfer operator. (\texttt{lib/restrictor.cu} and \texttt{lib/transfer\_util.cu}) }
-  \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. }
+  \item{ A default block size of $4$ is attempted if the MPI-partitioned local lattice extent on the current level is even and larger than or equal to $16$ lattice sites. }
+  \item{ If the number of lattice sites in a given direction on the current level is even and smaller than $16$, a block size of $2$ is used. }
+  \item{ The option \texttt{MGEnableSizeThreeBlocks} can be set to \texttt{yes}. Then, for levels coarser than the fine grid, extents smaller than $16$ and divisible by $3$, a block size of $3$ will be used. This might require the addition of instantiations of block sizes to QUDA in the restrictor and transfer operator. (\texttt{lib/restrictor.cu} and \texttt{lib/transfer\_util.cu}) }
+  \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. This means that if you need blocks of $5$ or $7$ for geometry reasons, you will need to choose a setup manually.}
 \end{itemize}
 
-
-Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted.
-
 A typical MG setup might look like this for twisted mass clover quarks: 
 
 \begin{verbatim}
 BeginExternalInverter QUDA
   MGNumberOfLevels = 3
   MGSetupSolver = cg
-  MGSetupSolverTolerance = 1e-6
-  MGSetupMaxSolverIterations = 1000
-  MGCoarseSolverTolerance = 0.25
-  MGCoarseSolverIterations = 75
-  MGSmootherTolerance = 0.25
-  MGSmootherPreIterations = 2
-  MGSmootherPostIterations = 4
-  MGOverUnderRelaxationFactor = 0.85
+  MGSetupSolverTolerance = 1e-6, 1e-6
+  MGSetupMaxSolverIterations = 1000, 1000
+  MGCoarseSolverTolerance = 0.25, 0.25, 0.25
+  MGCoarseSolverIterations = 75, 75, 75
+  MGSmootherTolerance = 0.25, 0.25, 0.25
+  MGSmootherPreIterations = 2, 2, 2
+  MGSmootherPostIterations = 4, 4, 4
+  MGOverUnderRelaxationFactor = 0.85, 0.85, 0.85
   MGCoarseMuFactor = 1.0, 1.0, 12.0
   MGNumberOfVectors = 24, 24, 32
   MGRunVerify = yes
@@ -175,34 +230,217 @@ \subsubsection{QUDA-MG interface}
   MGBlockSizesZ = 6, 4
   MGBlockSizesT = 6, 4
   MGSetupSolver = cg
-  MGSetupSolverTolerance = 1e-6
-  MGSetupMaxSolverIterations = 1000
-  MGCoarseSolverTolerance = 0.25
-  MGCoarseSolverIterations = 75
-  MGSmootherTolerance = 0.25
-  MGSmootherPreIterations = 2
-  MGSmootherPostIterations = 4
-  MGOverUnderRelaxationFactor = 0.85
+  MGSetupSolverTolerance = 1e-6, 1e-6
+  MGSetupMaxSolverIterations = 1000, 1000
+  MGCoarseSolverTolerance = 0.25, 0.25, 0.25
+  MGCoarseSolverIterations = 75, 75, 75
+  MGSmootherTolerance = 0.25, 0.25, 0.25
+  MGSmootherPreIterations = 2, 2, 2
+  MGSmootherPostIterations = 4, 4, 4
+  MGOverUnderRelaxationFactor = 0.85, 0.85, 0.85
   MGCoarseMuFactor = 1.0, 1.0, 12.0
   MGRunVerify = yes
-  MGEnableSizeThreeBlocks = no
 EndExternalInverter
 \end{verbatim}
 
-\subsubsection{More advanced settings}
-To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}.
-The MG-preconditioned GCR solver only works in double-single mixed precision, but the null vectors are stored in half precision as recommended by Kate Clark.
+Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted. It should be noted that the null vectors are kept in half precision as recommended, however.
 
-To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. 
-The default is 18 which corresponds to no compression. 
-Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. 
-As a consequence, the residual check on tmLQCD side will fail. 
-Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. 
-If trying to do so, compression will be de-activated automatically and the user gets informed via the standard output.
-The \texttt{FermionBC} setting can be used to force particular temporal boundary conditions to be applied to the gauge field in the Dirac operator.
+\paragraph{Further basic MG parameters:}
+\begin{itemize}
+  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25$ on all levels) }
+  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85$ on all levels)}
+  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default $8.0$ from the second level upwards).}
+  \item{ \texttt{MGSetup2KappaMu}: The value of $2\kappa\mu$ which should be used during the MG setup process. This is important in the HMC for standard twisted mass fermions, for example, because the setup should always be performed with the smallest quark mass to be employed in a simulation and it might be that a monomial with a heavier twisted quark mass is the first to call to MG and to thus trigger the setup. Generally this is set to the target light twisted quark mass. Setting this to $0.0$ implies that it is ignored. (float, default: $0.0$) }
+  \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different $\mu$ values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
+  \item{ \texttt{MGRefreshSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be regularly refreshed by running a few iterations of the setup solver on the current set of approximate null vectors in order to evolve these with the changing gauge field. A good rule of thumb is to perform this setup refresh about twice per coarsest time step. In other words: for a trajectory length $\tau$ and $N$ integration steps on the coarsest time scale, the refresh should be performed at intervals of $(\tau/(2N)-\epsilon)$ MDUs, where $\epsilon$ is a small number to make sure that the threshold is hit at every half-step of the integrator. (positive float, default: \texttt{2*DBL\_EPSILON})}
+  \item{ \texttt{MGRefreshSetupMaxSolverIterations}: Number of setup iterations to be performed on a per-level basis when a setup refresh is required, see also \texttt{MGRefreshSetupMDUThreshold} above. This must be specified for the HMC and a value between 20 and 35 is usually sufficient. (comma-separated list of positive integers, default: $0$ on all levels}
+  \item{ \texttt{MGResetSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be reset (instead of being refreshed) if the gauge configuration changes significantly at some point. This is the case, for example, when a trajectory is rejected. This value must always be set and a possible choice is to set it to the trajectory length. Note that there is an interaction with the preprocessor constant \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD}, the default value of which is $3.0$. This is because when a trajectory is rejected, an internal variable which tracks the state of the gauge field is incremented by \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD} to ensure that everything that depends on the gauge field is properly updated. This means that if the trajectory length $\tau \geq 3.0$, one should ignore the recommendation and set $x < 3.0$. In general, any value significantly larger than \texttt{MGRefreshSetupMDUThreshold} should be fine. (positive float, default: \texttt{2*DBL\_EPSILON}) }
+  \item{ \texttt{MGRunVerify}: Check GPU coarse-grid operators against CPU reference implementation and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGRunObliqueProjectionCheck}: Measure how well the null vector subspace adjusts the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGLowModeCheck}: Measure how well the null vector subspace overlaps with the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+\end{itemize}
 
-\subsubsection{Functionality}
-The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG, BICGSTAB or MG-preconditioned GCR. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
+The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about one to two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass as well as the underlying ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids. This kind of compensation will lead to a setup which is algorithmically less efficient (more matrix-vector products) but may have a lower overall time-to-solution.
 
+For use in the HMC, a typical MG setup might look something like:
 
+\begin{verbatim}
+BeginExternalInverter QUDA
+  EnablePinnedMemoryPool = yes 
+  EnableDeviceMemoryPool = no
+
+  Pipeline = 16
+  gcrNkrylov = 24
+  MGRunVerify = no
+  MGNumberOfLevels = 3
+  MGNumberOfVectors = 24, 32
+  MGSetupSolver = cg
+  MGSetup2KappaMu = 0.00014900994792
+  MGVerbosity = silent, silent, silent
+  MGSetupSolverTolerance = 5e-7, 5e-7
+  MGSetupMaxSolverIterations = 1500, 1500
+  MGCoarseSolverType = gcr, gcr, cagcr
+
+  MGCoarseMuFactor = 1.0, 1.75, 130.0
+  MgCoarseSolverTolerance = 0.1, 0.25, 0.25
+  MGCoarseMaxSolverIterations = 15, 10, 10
+
+  MGSmootherType = cagcr, cagcr, cagcr
+  MGSmootherTolerance = 0.15, 0.15, 0.1
+
+  MGSmootherPreIterations = 0, 0, 1
+  MGSmootherPostIterations = 1, 3, 1
+
+  MGBlockSizesX = 4, 4
+  MGBlockSizesY = 4, 4
+  MGBlockSizesZ = 4, 2
+  MGBlockSizesT = 4, 2
+
+  MGOverUnderRelaxationFactor = 0.95, 0.95, 0.95
+  MGRefreshSetupMDUThreshold = 0.03124
+  MGRefreshSetupMaxSolverIterations = 35, 35
+  MGResetSetupMDUThreshold = 1.0
+EndExternalInverter
+\end{verbatim}
+
+\paragraph{Advanced MG parameters:}
+\begin{itemize}
+  \item{ \texttt{MGCoarseSolverType}: The type of solver to be used on the intermediate and coarse grids. While this has to be specified for all levels, it will be ignored on the fine grid. Possible values:
+    \begin{itemize}
+      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
+      \item \texttt{gcr} (generalised conjugate residual)
+      \item \texttt{mr} (minimal residual)
+    \end{itemize}
+    The recommendation is to use \texttt{gcr} on all intermediate and \texttt{cagcr} on the coarsest level.
+    (comma-separated list of strings, default: \texttt{gcr} on all levels)}
+  \item{ \texttt{MGSmootherType}: The type of algorithm used as a smoother on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
+      \item \texttt{mr} (minimal residual)
+    \end{itemize}
+    It is often beneficial to employ \texttt{cagcr} on all levels as a smoother.
+    (comma-separated list of strings, default: \texttt{cagcr} on all levels)}
+\end{itemize}
+The fine-tuning parameters for the communication-avoiding solver are the same whether it is used as a smoother or coarse-grid solver.
+\begin{itemize}
+  \item{ \texttt{MGCoarseSolverCABasisType} and \texttt{MGSmootherSolverCABasisType}: The type of basis to use for the communication-avoiding solver on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} (\texttt{MGSmootherType}) for the given level. Possible values:
+  \begin{itemize}
+    \item \texttt{power} (power basis)
+    \item \texttt{chebyshev} (Chebyshev basis)
+  \end{itemize}
+    (comma-separated list of strings, default: \texttt{power} on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGCoarseSolverCABasisType}. There is no corresponding parameter for the smoother. (comma-separated list of positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMin} and \texttt{MGSmootherSolverCABasisLambdaMin}: Minimum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType} (\texttt{MGSmootherSolverCABasisType}). The default value of $0.0$ is recommened for safety. (comma-separated list of postitive floats, default: $0.0$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMax} and \texttt{MGSmootherSolverCABasisLambdaMax}: Maximum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType} (\texttt{MGSmootherSolverCABasisType}). The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
+\end{itemize}
+The fine-tuning parameters for the communication-avoiding solvers used in the MG setup are nominally the same as for the coarse-grid solver and smoother but this may diverge in principle as different solvers are employed in the setup.
+\begin{itemize}
+  \item{ \texttt{MGSetupCABasisType}: The type of polynomial basis to use if a communication-avoiding solver is used in the setup on a per-level basis. Possible values:
+  \begin{itemize}
+    \item \texttt{power} (power basis)
+    \item \texttt{chebyshev} (Chebyshev basis)
+  \end{itemize}
+    The recommendation is to use \texttt{chebyshev} for stability reasons. (comma-separated list of strings, default: \texttt{power} on all levels)}
+  \item{ \texttt{MGSetupCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used in the setup on a per-level basis. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGSetupCABasisType}. (comma-separated list of positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGSetupCABasisLambdaMin}: Minimum eigenvalues specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGSetupCABasisType}. The default value of $0.0$ is recommended for safety. (comma-separated list of positive floats, default: $0.0$ on all levels)}
+  \item{ \texttt{MGSetupCABasisLambdaMax}: Maximum eigenvalues specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGSetupCABasisType}. The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
+\end{itemize}
+
+\paragraph{Coarse-grid-deflated MG solver:}
+Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence.
+An example for twisted mass Wilson-clover multigrid can be found at \url{https://github.com/lattice/quda/wiki/Twisted-clover-deflated-multigrid} and some further documentation is to be found at \url{https://github.com/lattice/quda/wiki/QUDA's-eigensolvers}.
+
+The options of QUDA's eigensolver are set via the following set of parameters:
+\begin{itemize}
+  \item{ \texttt{MGUseEigSolver}: Whether or not the eigensolver should be run for the operator on a per-level basis. The most common use-case would be to enable the eigensolver on the coarsest grid. (comma-separated list of booleans \emph{yes} or \emph{no}, default: \emph{no} on all levels)}
+  \item{ \texttt{MGEigSolverRequireConvergence}: Require that the eigensolver actually must converge for the requested number of eigenvectors on a per-level basis as specified by \texttt{MGEigSolverNumberOfVectors}. Ignored if the corresponding \texttt{MGUseEigSolver} is set to \texttt{no}. (comma-separated list of booleans, default: \texttt{yes} on all levels)}
+  \item{ \texttt{MGEigSolverUseNormOp}: When set to \texttt{yes} on a per-level basis, the eigensolver uses $MM^\dagger$ instead of $M$. (comma-separated list of booleans, default: \texttt{no} on all levels)}
+  \item{ \texttt{MGEigSolverUseDagger}: When set to \texttt{yes} on a per-level basis, the eigensolver uses $M^\dagger$ instead of $M$ (or $M^\dagger M$ when \texttt{MGEigSolverUseNormOp} is set to \texttt{yes} on the respective level). (comma-separated list of booleans, defaullt: \texttt{no} on all levels)}
+  \item{ \texttt{MGEigSolverType}: Type of eigensolver to use on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{tr\_lanczos} (truncated Lanczos)
+      \item \texttt{ir\_arnoldi} (implicitly-restarted Arnoldi)
+    \end{itemize}
+    (comma-separated list of strings, default: \texttt{tr\_lanczos} on all levels)}
+  \item{ \texttt{MGEigSolverSpectrum}: Which part of the spectrum the eigensolver should focus on specified on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{smallest\_real} (smallest eigenvalues based on their real part)
+      \item \texttt{largest\_real} (largest eigenvalues based on their real part)
+      \item \texttt{smallest\_imaginary} (smallest eigenvalues based on their imaginary part)
+      \item \texttt{largest\_imaginary} (largest eigenvalues based on their imaginary part)
+      \item \texttt{smallest\_modulus} (smallest eigenvalues in modulus)
+      \item \texttt{largest\_modulus} (largest eigenvalues in modulus)
+    \end{itemize}
+    For the purpose of deflating the coarse-grid system, the usual choice is \texttt{smallest\_real}.
+    (comma-separated list of strings, default: \texttt{smallest\_real} on all levels)}
+  \item{ \texttt{MGEigSolverCheckConvergenceInterval}: When using an implicitly-restarted solver, perform this many restarts between convergence checks. (comma-separated list of positive integers, default: $5$ on all levels)}
+  \item{ \texttt{MGEigSolverMaxRestarts}: Perform at most this many restarts in the eigensolver specified on a per-level basis. (comma-separated list of positive integers, default: $100$ on all levels)}
+  \item{ \texttt{MGEigSolverTolerance}: The tolerance to use in the eigensolver on a per-level basis. (comma-separated list of floats, default: $1.0\cdot10^{-6}$ on all levels)}
+  \item{ \texttt{MGEigPreserveDeflationSubspace}: When a coarse-grid-deflated MG setup is updated between a change of the $\mu$ parameter, setting this to \texttt{yes} will preserve the deflation subspace. If set to \texttt{no}, the deflation subspace would be destroyed and regenerated upon a change of $\mu$. (boolean, default: \texttt{yes})}
+  \item{ \texttt{MGEigSolverNumberOfVectors}: Number of eigenvectors to compute on a per-level basis. Usually, this is intended for the coarsest level where, depending on the lattice size and physical parameters, numbers between $512$ and $3072$ may be required. On the levels where \texttt{MGEigSolverRequireConvergence = no} is set, this should be set to the value of \texttt{MGNumberOfVectors} on that level. (comma-separated list of positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGEigSolverKrylovSubspaceSize}: Size of the Krylov subspace to be used to be built using the eigenvectors. A good rule of thumb is to set this to a small multiple ($2\times$ or $3\times$) of the corresponding value for \texttt{MGEigSolverNumberOfVectors} for the respective level. On levels where \texttt{MGEigSolverRequireConvergence = no} is set, this should be set to the corresponding value of \texttt{MGNumberOfVectors}. (comma-separated list of positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGEigSolverUsePolynomialAcceleration}: Whether or not to use polynomial acceleration in the eigensolver on a per-level basis. (comma-separated list of booleans, default: \texttt{yes} everywhere)}
+  \item{ \texttt{MGEigSolverPolynomialDegree}: Degree of the polynomial used for polynomial acceleration of the eigensolver, specified on a per-level basis. This will be ignored except if \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on a given level. Values between $50$ to $100$ seem to work well. (comma-separated list of postive integers, default: $100$ on all levels)}
+  \item{ \texttt{MGEigSolverPolyMin}: Smallest eigenvalue to suppress via polynomial acceleration. It should not be set too low and only lowered step by step from its default value. Has a large impact on the rate of convergence of the eigensolver when set correctly. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on the corresponding level. (comma-separated list of positive real numbers, default: $1.0$ on all levels)}
+  \item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 20\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
+\end{itemize}
+
+A typical coarse-grid-deflated setup might look something like:
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  EnablePinnedMemoryPool = yes
+  EnableDeviceMemoryPool = no
+  Pipeline = 16
+  gcrNkrylov = 24
+  MGRunVerify = no
+  MGCoarseMuFactor = 1.0, 1.0, 1.0
+  MGNumberOfLevels = 3
+  MGNumberOfVectors = 24, 32
+  MGSetupSolver = cg
+  MGSetup2KappaMu = 0.000200774448
+  MGVerbosity = silent, silent, silent
+  MGBlockSizesX = 4, 2
+  MGBlockSizesY = 4, 2
+  MGBlockSizesZ = 4, 2
+  MGBlockSizesT = 4, 2
+  MGSetupSolverTolerance = 5e-7, 5e-7
+  MGSetupMaxSolverIterations = 1500, 1500
+  MGCoarseSolverType = gcr, gcr, cagcr
+  MgCoarseSolverTolerance = 0.1, 0.1, 0.1
+  MGCoarseMaxSolverIterations = 10, 10, 10
+  MGSmootherType = cagcr, cagcr, cagcr
+  MGSmootherTolerance = 0.1, 0.1, 0.1
+  MGSmootherPreIterations = 0, 0, 0
+  MGSmootherPostIterations = 2, 2, 2
+  MGOverUnderRelaxationFactor = 0.90, 0.90, 0.90
+
+  MGUseEigSolver = no, no, yes
+  MGEigSolverType = tr_lanczos, tr_lanczos, tr_lanczos
+  MGEigSolverSpectrum = smallest_real, smallest_real, smallest_real
+  MGEigPreserveDeflationSubspace = yes
+  MGEigSolverNumberOfVectors = 0, 0, 1024
+  MGEigSolverKrylovSubspaceSize = 0, 0, 3072
+  MGEigSolverRequireConvergence = no, no, yes
+  MGEigSolverMaxRestarts = 0, 0, 100
+  MGEigSolverTolerance = 1e-4, 1e-4, 1e-4
+  MGEigSolverUseNormOp  = no, no, yes
+  MGEigSolverUseDagger = no, no, no
+  MGEigSolverUsePolynomialAcceleration = no, no, yes
+  MGEigSolverPolynomialDegree = 0, 0, 100
+  MGEigSolverPolyMin = 0.0, 0.0, 0.01
+  MGEigSolverPolyMax = 0.0, 0.0, 3.6
+  MGCoarseSolverCABasisSize = 4, 4, 4
+EndExternalInverter
+\end{verbatim}
 
+In order to determine approppriate values for \texttt{MGEigSolverPolyMin} and \texttt{MGEigSolverPolyMax}, one should run the eigensolver once without polynomial acceleration (\texttt{MGEigSolverUsePolynomialAcceleration = no}) and run tmLQCD at \texttt{DebugLevel = 3} to trigger the eigenvalues to be printed by the eigensolver.
+Then, setting \texttt{MGEigSolverSpectrum = largest\_real}, \texttt{MGEigSolverNumberOfVectors = 1} and \texttt{MGEigSolverKrylovSubspaceSize = 16} (all on the coarsest grid, the third number in the example above), one can easily obtain the largest eigenvalue of the coarse grid operator for a particular configuration or set of configurations.
+Increasing this number by 20\% or so gives a good value for \texttt{MGEigSolverPolyMax}.
+Subsequently, setting \texttt{MGEigSolverSpectrum = smallest\_real}, \texttt{MGEigSolverNumberOfVectors = 0, 0, 1024}, \texttt{MGEigSolverKrylovSubspaceSize = 0, 0, 3072} will give the requisite 1024 smallest eigenvalues.
+Noting the largest of these, the next largest order of magnitude can be used for \texttt{MGEigSolverPolyMin}.
+In other words, if the largest of these smallest eigenvalues is $4\cdot10^{-3}$, for example, then \texttt{MGEigSolverPolyMin} can be set to 0.01.
+This ensures that the desired (smallest) part of the spectrum is smaller than \texttt{MGEigSolverPolyMin} and that the entire spectrum is contained in the range up to \texttt{MGEigSolverPolyMax}.
+After this, polynomial acceleration can be enabled, which should reduce setup time significantly.
diff --git a/quda_dummy_types.h b/quda_dummy_types.h
index b689050dc..668237689 100644
--- a/quda_dummy_types.h
+++ b/quda_dummy_types.h
@@ -39,6 +39,11 @@ typedef enum QudaInverterType_s {
  QUDA_CG_INVERTER = CG,
  QUDA_MR_INVERTER = MR,
  QUDA_GCR_INVERTER = GCR,
+ QUDA_CGNE_INVERTER = CGNE,
+ QUDA_CGNR_INVERTER = CGNR,
+ QUDA_CA_CG_INVERTER = CA_CG,
+ QUDA_CA_CGNE_INVERTER = CA_CGNE, 
+ QUDA_CA_CGNR_INVERTER = CA_CGNR,
  QUDA_CA_GCR_INVERTER = CA_GCR
 } QudaInverterType;
 
diff --git a/quda_interface.c b/quda_interface.c
index 6196196db..ebfdba7ec 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -114,6 +114,8 @@
 #include "tm_debug_printf.h"
 #include "phmc.h"
 #include "quda_gauge_paths.inc"
+#include "io/gauge.h"
+#include "measure_gauge_action.h"
 
 // nstore is generally like a gauge id, for measurements it identifies the gauge field
 // uniquely 
@@ -245,7 +247,7 @@ void _setDefaultQudaParam(void){
   QudaPrecision cpu_prec  = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
-  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
+  QudaPrecision cuda_prec_precondition = QUDA_SINGLE_PRECISION;
 
   QudaTune tune = QUDA_TUNE_YES;
 
@@ -1838,15 +1840,18 @@ void _setMGInvertParam(QudaInvertParam * mg_inv_param, const QudaInvertParam * c
   mg_inv_param->cuda_prec = inv_param->cuda_prec;
   mg_inv_param->cuda_prec_sloppy = inv_param->cuda_prec_sloppy;
   mg_inv_param->cuda_prec_refinement_sloppy = inv_param->cuda_prec_refinement_sloppy;
-  mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_precondition;
-  mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_eigensolver;
   
   mg_inv_param->clover_cpu_prec = inv_param->clover_cpu_prec;
   mg_inv_param->clover_cuda_prec = inv_param->clover_cuda_prec;
   mg_inv_param->clover_cuda_prec_sloppy = inv_param->clover_cuda_prec_sloppy;
   mg_inv_param->clover_cuda_prec_refinement_sloppy = inv_param->clover_cuda_prec_refinement_sloppy;
-  mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_precondition;
-  mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_eigensolver;
+  
+  // it seems that the MG-internal preconditioner and eigensolver need to be
+  // consistent with sloppy precision
+  mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_sloppy;
+  mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_sloppy;
+  mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_sloppy;
+  mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_sloppy;
   
   mg_inv_param->clover_order = inv_param->clover_order;
   mg_inv_param->gcrNkrylov = inv_param->gcrNkrylov;
@@ -1865,6 +1870,9 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
   QudaInvertParam * mg_inv_param = mg_param->invert_param;
   _setMGInvertParam(mg_inv_param, &inv_param);
 
+  mg_param->setup_type = QUDA_NULL_VECTOR_SETUP;
+
+  mg_param->coarse_guess = quda_input.mg_coarse_guess;
   mg_param->preserve_deflation = quda_input.mg_eig_preserve_deflation;
 
   mg_param->n_level = quda_input.mg_n_level;
@@ -2013,6 +2021,10 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
     mg_param->coarse_solver_ca_lambda_min[level] = quda_input.mg_coarse_solver_ca_lambda_min[level];
     mg_param->coarse_solver_ca_lambda_max[level] = quda_input.mg_coarse_solver_ca_lambda_max[level];
     
+    mg_param->smoother_solver_ca_basis[level]      = quda_input.mg_smoother_solver_ca_basis[level];
+    mg_param->smoother_solver_ca_lambda_min[level] = quda_input.mg_smoother_solver_ca_lambda_min[level];
+    mg_param->smoother_solver_ca_lambda_max[level] = quda_input.mg_smoother_solver_ca_lambda_max[level];
+   
 
     // set the MG EigSolver parameters, almost equivalent to
     // setEigParam from QUDA's multigrid_invert_test, except
@@ -2035,7 +2047,9 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
 
       mg_eig_param[level].n_ev = quda_input.mg_eig_nEv[level];
       mg_eig_param[level].n_kr = quda_input.mg_eig_nKr[level];
-      mg_eig_param[level].n_conv = quda_input.mg_n_vec[level];
+      mg_eig_param[level].n_conv = quda_input.mg_eig_nEv[level]; // require convergence of all eigenvalues
+      mg_eig_param[level].n_ev_deflate = mg_eig_param[level].n_conv; // deflate all converged eigenvalues
+      // TODO expose this setting: mg_eig_param[level].batched_rotate = 128;
       mg_eig_param[level].require_convergence = quda_input.mg_eig_require_convergence[level];
 
       mg_eig_param[level].tol = quda_input.mg_eig_tol[level];
@@ -2231,9 +2245,16 @@ int invert_eo_degenerate_quda(spinor * const out,
                                                   rel_prec, even_odd_flag, solver_params,
                                                   sloppy_precision, compression, QpQm);
         if (ret_value >= max_iter) {
+          char outname[200];
+          snprintf(outname, 200, "conf_mg_refresh_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
+          paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(
+              measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+          int status = write_gauge_field(outname, 64, xlfInfo);
+          free(xlfInfo);
+
           char errmsg[200];
           snprintf(errmsg, 200, "QUDA-MG solver failed to converge in %d iterations even after forced setup refresh. Terminating!",
-                   max_iter);
+                 max_iter);
           fatal_error(errmsg, __func__);
           return -1;
         } else {
diff --git a/quda_types.h b/quda_types.h
index 667af91b1..498d6a936 100644
--- a/quda_types.h
+++ b/quda_types.h
@@ -91,6 +91,10 @@ typedef struct tm_QudaParams_t {
   int                  mg_coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL];
   double               mg_coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL];
   double               mg_coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
+  
+  QudaCABasis          mg_smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL];
+  double               mg_smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL];
+  double               mg_smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
 
   // parameters related to coarse grid deflation in the MG
   int                  mg_use_eig_solver[QUDA_MAX_MG_LEVEL];
diff --git a/read_input.l b/read_input.l
index ae84c7541..0fc555013 100644
--- a/read_input.l
+++ b/read_input.l
@@ -1739,10 +1739,33 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_setup_2kappamu = c;
     if(myverbose) printf("  MGSetup2KappaMu set to %e line %d\n", quda_input.mg_setup_2kappamu, line_of_file);
   }
+  /* TODO: the MGSetupSolver should be set on a per-level basis
+           allowing communication-avoiding solvers to be used
+           on the coarser grids */
   {SPC}*MGSetupSolver{EQL}cg {
     quda_input.mg_setup_inv_type = QUDA_CG_INVERTER;
     if(myverbose) printf("  MGSetupSolver set to CG line %d\n", line_of_file);
   }
+  {SPC}*MGSetupSolver{EQL}cgne {
+    quda_input.mg_setup_inv_type = QUDA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CGNE line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cgnr {
+    quda_input.mg_setup_inv_type = QUDA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CGNR line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacg {
+    quda_input.mg_setup_inv_type = QUDA_CA_CG_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CG line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacgne {
+    quda_input.mg_setup_inv_type = QUDA_CA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CGNE line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacgnr {
+    quda_input.mg_setup_inv_type = QUDA_CA_CGNR_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CGNR line %d\n", line_of_file);
+  }
   {SPC}*MGSetupSolver{EQL}bicgstab {
     quda_input.mg_setup_inv_type = QUDA_BICGSTAB_INVERTER;
     if(myverbose) printf("  MGSetupSolver set to BiCGstab line %d\n", line_of_file);
@@ -1821,6 +1844,14 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_reset_setup_mdu_threshold = c;
     if(myverbose) printf("  MGResetSetupMDUThreshold set to %f line %d\n", quda_input.mg_reset_setup_mdu_threshold, line_of_file);
   }
+  {SPC}*MGCoarseGuess{EQL}yes {
+    quda_input.mg_coarse_guess = QUDA_BOOLEAN_YES;
+    if(myverbose) printf("  MGCoarseGuess set to YES in line %d\n", line_of_file);
+  }
+  {SPC}*MGCoarseGuess{EQL}no {
+    quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
+    if(myverbose) printf("  MGCoarseGuess set to NO in line %d\n", line_of_file);
+  }
   {SPC}*MGUseEigSolver{EQL}{STRLIST} {
     parse_quda_mg_bool_par_array(yytext, &(quda_input.mg_use_eig_solver[0]));
   }
@@ -1912,14 +1943,6 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_eig_preserve_deflation = QUDA_BOOLEAN_YES;
     if(myverbose) printf("  MGEigPreserveDeflationSubspace set to YES in line %d\n", line_of_file);
   }
-  {SPC}*MGEigSolverCoarseGuess{EQL}yes {
-    quda_input.mg_coarse_guess = QUDA_BOOLEAN_YES;
-    if(myverbose) printf("  MGEigSolverCoarseGuess set to YES in line %d\n", line_of_file);
-  }
-  {SPC}*MGEigSolverCoarseGuess{EQL}no {
-    quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
-    if(myverbose) printf("  MGEigSolverCoarseGuess set to NO in line %d\n", line_of_file);
-  }
   {SPC}*MGEigSolverNumberOfVectors{EQL}{STRLIST} {
     parse_quda_mg_int_par_array(yytext, &(quda_input.mg_eig_nEv[0]));
   }
@@ -1968,6 +1991,15 @@ static inline double fltlist_next_token(int * const list_end){
   {SPC}*MGSetupCABasisLambdaMax{EQL}{STRLIST} {
     parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_setup_ca_lambda_max[0]));
   }
+  {SPC}*MGSmootherSolverCABasisType{EQL}{STRLIST} {
+    parse_quda_mg_cabasis_par_array(yytext, &(quda_input.mg_smoother_solver_ca_basis[0]));
+  }
+  {SPC}*MGSmootherSolverCABasisLambdaMin{EQL}{STRLIST} {
+    parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_smoother_solver_ca_lambda_min[0]));
+  }
+  {SPC}*MGSmootherSolverCABasisLambdaMax{EQL}{STRLIST} {
+    parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_smoother_solver_ca_lambda_max[0]));
+  }
   {SPC}*MGCoarseSolverCABasisType{EQL}{STRLIST} {
     parse_quda_mg_cabasis_par_array(yytext, &(quda_input.mg_coarse_solver_ca_basis[0]));
   }
@@ -3903,6 +3935,7 @@ int read_input(char * conf_file){
   quda_input.fermionbc = TM_QUDA_THETABC;
   quda_input.pipeline = 0;
   quda_input.gcrNkrylov = 10;
+  quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
   quda_input.reliable_delta = 1e-3; // anything smaller than this and we break down in double-half
   quda_input.mg_n_level = _default_quda_mg_n_level;
   quda_input.mg_setup_2kappamu = _default_quda_mg_setup_2kappamu;
@@ -3921,11 +3954,11 @@ int read_input(char * conf_file){
     quda_input.mg_n_vec[level] = _default_quda_mg_n_vec;
     quda_input.mg_mu_factor[level] = 1.0;
     quda_input.mg_coarse_solver_type[level] = QUDA_GCR_INVERTER;
-    quda_input.mg_smoother_type[level] = QUDA_MR_INVERTER ;
+    quda_input.mg_smoother_type[level] = QUDA_CA_GCR_INVERTER;
 
     quda_input.mg_use_eig_solver[level] = QUDA_BOOLEAN_NO;
     quda_input.mg_eig_preserve_deflation = QUDA_BOOLEAN_NO;
-    quda_input.mg_eig_tol[level] = 1.0e-3;
+    quda_input.mg_eig_tol[level] = 1.0e-6;
     quda_input.mg_eig_require_convergence[level] = QUDA_BOOLEAN_YES;
     quda_input.mg_eig_type[level] = QUDA_EIG_TR_LANCZOS;
     quda_input.mg_eig_spectrum[level] = QUDA_SPECTRUM_SR_EIG;
@@ -3937,6 +3970,8 @@ int read_input(char * conf_file){
     quda_input.mg_eig_poly_deg[level] = 100;
     quda_input.mg_eig_amin[level] = 1.0;
     quda_input.mg_eig_amax[level] = 5.0;
+    quda_input.mg_eig_nEv[level] = 0;
+    quda_input.mg_eig_nKr[level] = 0;
 
     quda_input.mg_setup_ca_basis[level] = QUDA_POWER_BASIS;
     quda_input.mg_setup_ca_basis_size[level] = 4;
@@ -3947,6 +3982,10 @@ int read_input(char * conf_file){
     quda_input.mg_coarse_solver_ca_basis_size[level] = 4;
     quda_input.mg_coarse_solver_ca_lambda_min[level] = 0.0;
     quda_input.mg_coarse_solver_ca_lambda_max[level] = -1.0;
+    
+    quda_input.mg_smoother_solver_ca_basis[level] = QUDA_POWER_BASIS;
+    quda_input.mg_smoother_solver_ca_lambda_min[level] = 0.0;
+    quda_input.mg_smoother_solver_ca_lambda_max[level] = -1.0;
 
     /* note: when the user does not specify any blocking parameters,
      * a reasonable set will be computed automatically in the MG setup
diff --git a/solver/monomial_solve.c b/solver/monomial_solve.c
index 66684bb85..805899a5d 100644
--- a/solver/monomial_solve.c
+++ b/solver/monomial_solve.c
@@ -82,13 +82,24 @@
 #endif
 #include "fatal_error.h"
 
-#include <io/params.h>
-#include <io/spinor.h>
+#include "io/params.h"
+#include "io/spinor.h"
+#include "io/gauge.h"
+#include "measure_gauge_action.h"
 
 #ifdef TM_USE_QUDA
 #  include "quda_interface.h"
 #endif
 
+void solve_fail_write_config_and_abort(const char * const solver) {
+  char outname[200];
+  snprintf(outname, 200, "conf_monomial_solve_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
+  paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+  int status = write_gauge_field(outname, 64, xlfInfo);
+  free(xlfInfo);
+  fatal_error("Error: solver reported -1 iterations.", solver);
+}
+
 int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
                      const int max_iter, double eps_sq, const int rel_prec, 
                      const int N, matrix_mult f, int solver_type){
@@ -216,7 +227,7 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    fatal_error("Error: solver reported -1 iterations.", "solve_degenerate");
+    solve_fail_write_config_and_abort("solve_degenerate");
   }
 
   return (iteration_count);
@@ -425,7 +436,7 @@ int solve_mms_tm(spinor ** const P, spinor * const Q,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_tm");
+    solve_fail_write_config_and_abort("solve_mms_tm");
   }
 
   return(iteration_count);
@@ -671,7 +682,7 @@ int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd");
+    solve_fail_write_config_and_abort("solve_mms_nd");
   }
 
   return (iteration_count);
@@ -726,7 +737,7 @@ int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd_plus");
+    solve_fail_write_config_and_abort("solve_mms_nd_plus");
   }
 
   return iteration_count;
diff --git a/solver/solver_types.h b/solver/solver_types.h
index 0538481ab..7cd1a52e1 100644
--- a/solver/solver_types.h
+++ b/solver/solver_types.h
@@ -48,6 +48,11 @@ typedef enum SOLVER_TYPE {
  MIXEDBICGSTAB,
  DUMMYHERMTEST,
  CA_GCR,
+ CGNE,
+ CGNR,
+ CA_CG,
+ CA_CGNE,
+ CA_CGNR,
  INVALID_SOLVER
 } SOLVER_TYPE;