diff --git a/dlib/optimization/optimization_solve_qp_using_smo.h b/dlib/optimization/optimization_solve_qp_using_smo.h
index f205b71cd..b9cb8dcef 100644
--- a/dlib/optimization/optimization_solve_qp_using_smo.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -550,6 +550,57 @@ namespace dlib
         return iter+1;
     }
 
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        // Check if each vector in Q_offdiag is actually a constant times the 1s vector.
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        bool has_uniform_offdiag_vectors(
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            for (auto& x : Q_offdiag)
+            {
+                auto ref = x.second(0);
+                for (auto& y : x.second)
+                    if (ref != y)
+                        return false;
+            }
+            return true;
+        }
+
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        matrix<T,0,0,MM,L> compact_offdiag(
+            const size_t& num_blocks,
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            matrix<T,0,0,MM,L> temp;
+            // we can only compact the offdiag information if they are uniform vectors
+            if (!has_uniform_offdiag_vectors(Q_offdiag))
+                return temp;
+
+            temp.set_size(num_blocks, num_blocks);
+            temp = 0;
+
+            for (auto& x : Q_offdiag)
+            {
+                long r = x.first.first;
+                long c = x.first.second;
+                temp(r,c) = x.second(0);
+                temp(c,r) = x.second(0);
+            }
+
+            return temp;
+        }
+    }
+
 // ----------------------------------------------------------------------------------------
 
     template <
@@ -624,6 +675,8 @@ namespace dlib
 #endif // ENABLE_ASSERTS
 
 
+        const auto offdiag_compact = impl::compact_offdiag(Q_blocks.size(), Q_offdiag);
+        matrix<T,0,0,MM,L> temp, alphas_compact;
 
         // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
         std::vector<matrix<T,NR,NC,MM,L>> df;// = Q*alpha + b;
@@ -632,18 +685,41 @@ namespace dlib
             df.resize(Q_blocks.size());
             for (size_t i = 0; i < df.size(); ++i)
                 df[i] = Q_blocks[i]*alphas[i] + bs[i];
-            // Don't forget to include the Q_offdiag terms in the computation
-            for (auto& p : Q_offdiag)
+
+
+            // Don't forget to include the Q_offdiag terms in the computation.  Note that
+            // we have two options for how we can compute this part.  If Q_offdiag is
+            // uniform and can be compacted into a simple matrix and there are a lot of off
+            // diagonal entries then it's faster to do it as a matrix multiply.  Otherwise
+            // we do the more general computation.
+            if (offdiag_compact.size() != 0 && Q_offdiag.size() > Q_blocks.size()*5)
             {
-                long r = p.first.first;
-                long c = p.first.second;
-                df[r] += pointwise_multiply(p.second, alphas[c]);
-                if (r != c)
-                    df[c] += pointwise_multiply(p.second, alphas[r]);
+                // Do it as a matrix multiply (with a bit of data shuffling)
+                alphas_compact.set_size(alphas[0].size(), offdiag_compact.nr());
+                for (long c = 0; c < alphas_compact.nc(); ++c)
+                    set_colm(alphas_compact,c) = alphas[c];
+                temp = alphas_compact*offdiag_compact;
+                for (size_t i = 0; i < df.size(); ++i)
+                    df[i] += colm(temp,i);
+            }
+            else
+            {
+                // Do the fully general computation that allows for non-uniform values in
+                // the off diagonal vectors.
+                for (auto& p : Q_offdiag)
+                {
+                    long r = p.first.first;
+                    long c = p.first.second;
+                    df[r] += pointwise_multiply(p.second, alphas[c]);
+                    if (r != c)
+                        df[c] += pointwise_multiply(p.second, alphas[r]);
+                }
             }
         };
         compute_df();
 
+
+
         std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd;
         std::vector<matrix<T,NR,NC,MM,L>> QQ;// = reciprocal_max(diag(Q));
         QQ.resize(Q_blocks.size());
diff --git a/dlib/test/opt_qp_solver.cpp b/dlib/test/opt_qp_solver.cpp
index 8a95bff47..546406873 100644
--- a/dlib/test/opt_qp_solver.cpp
+++ b/dlib/test/opt_qp_solver.cpp
@@ -582,6 +582,96 @@ namespace
         }
     }
 
+// ----------------------------------------------------------------------------------------
+
+    void test_solve_qp_box_constrained_blockdiag_compact(dlib::rand& rnd, double percent_off_diag_present)
+    {
+        print_spinner();
+
+        dlog << LINFO << "test_solve_qp_box_constrained_blockdiag_compact(), percent_off_diag_present==" << percent_off_diag_present;
+
+        std::map<unordered_pair<size_t>, matrix<double,0,1>> offdiag;
+        std::vector<matrix<double>> Q_blocks;
+        std::vector<matrix<double,0,1>> bs;
+
+        const long num_blocks = 20;
+        const long dims = 4;
+        const double lambda = 10;
+        for (long i = 0; i < num_blocks; ++i)
+        {
+            matrix<double> Q1;
+            matrix<double,0,1> b1;
+            Q1 = randm(dims,dims,rnd); Q1 = Q1*trans(Q1);
+            b1 = gaussian_randm(dims,1, i);
+
+            Q_blocks.push_back(Q1);
+            bs.push_back(b1);
+
+            // test with some graph regularization terms
+            for (long j = 0; j < num_blocks; ++j)
+            {
+                if (rnd.get_random_double() < percent_off_diag_present)
+                {
+                    if (i==j)
+                        offdiag[make_unordered_pair(i,j)] = (num_blocks-1)*lambda*rnd.get_random_double()*ones_matrix<double>(dims,1);
+                    else
+                        offdiag[make_unordered_pair(i,j)] = -lambda*rnd.get_random_double()*ones_matrix<double>(dims,1);
+                }
+            }
+        }
+
+        // build out the dense version of the QP so we can test it against the dense solver.
+        matrix<double> Q(num_blocks*dims, num_blocks*dims); 
+        Q = 0;
+        matrix<double,0,1> b(num_blocks*dims);
+        for (long i = 0; i < num_blocks; ++i)
+        {
+            set_subm(Q,i*dims,i*dims,dims,dims) = Q_blocks[i];
+            set_subm(b,i*dims,0,dims,1) = bs[i];
+        }
+        for (auto& p : offdiag)
+        {
+            long r = p.first.first;
+            long c = p.first.second;
+            set_subm(Q, dims*r,dims*c, dims,dims) += diagm(p.second);
+            if (c != r)
+                set_subm(Q, dims*c,dims*r, dims,dims) += diagm(p.second);
+        }
+
+
+
+        matrix<double,0,1> alpha = zeros_matrix<double>(dims*num_blocks,1);
+        matrix<double,0,1> lower = -10000*ones_matrix<double>(dims*num_blocks,1);
+        matrix<double,0,1> upper = 10000*ones_matrix<double>(dims*num_blocks,1);
+
+        auto iters = solve_qp_box_constrained(Q, b, alpha, lower, upper, 1e-9, 20000);
+        dlog << LINFO << "iters: "<< iters;
+
+
+        matrix<double,0,1> init_alpha = zeros_matrix(bs[0]);
+        lower = -10000*ones_matrix(bs[0]);
+        upper = 10000*ones_matrix(bs[0]);
+
+        std::vector<matrix<double,0,1>> alphas(num_blocks, init_alpha);
+        std::vector<matrix<double,0,1>> lowers(num_blocks, lower);
+        std::vector<matrix<double,0,1>> uppers(num_blocks, upper);
+
+        auto iters2 = solve_qp_box_constrained_blockdiag(Q_blocks, bs, offdiag, alphas, lowers, uppers, 1e-9, 20000);
+        dlog << LINFO << "iters2: "<< iters2;
+
+
+        const matrix<double> refalpha = reshape(alpha, num_blocks, dims);
+
+        // now make sure the two solvers agree on the outputs.
+        for (long r = 0; r < num_blocks; ++r)
+        {
+            for (long c = 0; c < dims; ++c)
+            {
+                DLIB_TEST_MSG(std::abs(refalpha(r,c) - alphas[r](c)) < 1e-6, std::abs(refalpha(r,c) - alphas[r](c)));
+            }
+        }
+    }
+
 // ----------------------------------------------------------------------------------------
 
     class opt_qp_solver_tester : public tester
@@ -642,6 +732,15 @@ namespace
 
             test_find_gap_between_convex_hulls();
             test_solve_qp_box_constrained_blockdiag();
+
+            // try a range of off diagonal sparseness.  We do this to make sure we exercise both
+            // the compact and sparse code paths within the solver.
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.001);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.01);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.04);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.10);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 0.50);
+            test_solve_qp_box_constrained_blockdiag_compact(rnd, 1.00);
         }
 
         double do_the_test (