diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index bae786e4b..3f93144da 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -33,7 +33,7 @@ jobs:
           -v $GITHUB_WORKSPACE:/workspace \
           -v ~/.cache/pip:/root/.cache/pip \
           pyscf/gpu4pyscf-devel:pyscf-2.8 \
-          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
+          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && pip3 install MCfun && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index 8d83e3f92..bd9259a6f 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -12,19 +12,166 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import cupy as cp
+from pyscf import lib
 from pyscf.dft import gks
-from gpu4pyscf.dft import numint
+from gpu4pyscf.dft import numint2c
 from gpu4pyscf.dft import rks
 from gpu4pyscf.scf.ghf import GHF
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib import utils
+from gpu4pyscf.lib.cupy_helper import tag_array
 
-class GKS(gks.GKS, GHF):
-    from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
+
+def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+    '''Coulomb + XC functional
+
+    .. note::
+        This function will change the ks object.
+
+    Args:
+        ks : an instance of :class:`RKS`
+            XC functional are controlled by ks.xc attribute.  Attribute
+            ks.grids might be initialized.
+        dm : ndarray or list of ndarrays
+            A density matrix or a list of density matrices
+
+    Kwargs:
+        dm_last : ndarray or a list of ndarrays or 0
+            The density matrix baseline.  If not 0, this function computes the
+            increment of HF potential w.r.t. the reference HF potential matrix.
+        vhf_last : ndarray or a list of ndarrays or 0
+            The reference Vxc potential matrix.
+        hermi : int
+            Whether J, K matrix is hermitian
+
+            | 0 : no hermitian or symmetric
+            | 1 : hermitian
+            | 2 : anti-hermitian
+
+    Returns:
+        matrix Veff = J + Vxc.  Veff can be a list matrices, if the input
+        dm is a list of density matrices.
+    '''
+    if mol is None: mol = ks.mol
+    if dm is None: dm = ks.make_rdm1()
+    ks.initialize_grids(mol, dm)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+
+    ground_state = isinstance(dm, cp.ndarray) and dm.ndim == 2
+
+    if hermi == 2:  # because rho = 0
+        n, exc, vxc = 0, 0, 0
+    else:
+        max_memory = ks.max_memory - lib.current_memory()[0]
+        ni = ks._numint
+        if ni.collinear[0].lower() != 'm':
+            raise NotImplementedError('Only multi-colinear GKS is implemented')
+        n, exc, vxc = ni.get_vxc(mol, ks.grids, ks.xc, dm,
+                                 hermi=hermi, max_memory=max_memory)
+        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        if ks.do_nlc():
+            if ni.libxc.is_nlc(ks.xc):
+                xc = ks.xc
+            else:
+                assert ni.libxc.is_nlc(ks.nlc)
+                xc = ks.nlc
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
+                                          hermi=hermi, max_memory=max_memory)
+            exc += enlc
+            vxc += vnlc
+            logger.debug(ks, 'nelec with nlc grids = %s', n)
+        t0 = logger.timer(ks, 'vxc', *t0)
+
+    incremental_jk = (ks._eri is None and ks.direct_scf and
+                      getattr(vhf_last, 'vj', None) is not None)
+    if incremental_jk:
+        _dm = cp.asarray(dm) - cp.asarray(dm_last)
+    else:
+        _dm = dm
+    if not ni.libxc.is_hybrid_xc(ks.xc):
+        vk = None
+        vj = ks.get_j(mol, _dm, hermi)
+        if incremental_jk:
+            vj += vhf_last.vj
+        vxc += vj
+    else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
+        if omega == 0:
+            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vk *= hyb
+        elif alpha == 0: # LR=0, only SR exchange
+            vj = ks.get_j(mol, _dm, hermi)
+            vk = ks.get_k(mol, _dm, hermi, omega=-omega)
+            vk *= hyb
+        elif hyb == 0: # SR=0, only LR exchange
+            vj = ks.get_j(mol, _dm, hermi)
+            vk = ks.get_k(mol, _dm, hermi, omega=omega)
+            vk *= alpha
+        else: # SR and LR exchange with different ratios
+            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vk *= hyb
+            vklr = ks.get_k(mol, _dm, hermi, omega=omega)
+            vklr *= (alpha - hyb)
+            vk += vklr
+        if incremental_jk:
+            vj += vhf_last.vj
+            vk += vhf_last.vk
+        vxc += vj - vk
+
+        if ground_state:
+            exc -= cp.einsum('ij,ji', dm, vk).real * .5
+
+    if ground_state:
+        ecoul = cp.einsum('ij,ji', dm, vj).real * .5
+    else:
+        ecoul = None
+
+    vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
+    return vxc
+
+
+class GKS(rks.KohnShamDFT, GHF):
+    to_gpu = utils.to_gpu
+    device = utils.device
 
     def __init__(self, mol, xc='LDA,VWN'):
-        raise NotImplementedError
+        GHF.__init__(self, mol)
+        rks.KohnShamDFT.__init__(self, xc)
+        self._numint = numint2c.NumInt2C()
+
+    def dump_flags(self, verbose=None):
+        GHF.dump_flags(self, verbose)
+        rks.KohnShamDFT.dump_flags(self, verbose)
+        logger.info(self, 'collinear = %s', self._numint.collinear)
+        if self._numint.collinear[0] == 'm':
+            logger.info(self, 'mcfun spin_samples = %s', self._numint.spin_samples)
+            logger.info(self, 'mcfun collinear_thrd = %s', self._numint.collinear_thrd)
+            logger.info(self, 'mcfun collinear_samples = %s', self._numint.collinear_samples)
+        return self
+
+    @property
+    def collinear(self):
+        return self._numint.collinear
+    @collinear.setter
+    def collinear(self, val):
+        self._numint.collinear = val
 
+    @property
+    def spin_samples(self):
+        return self._numint.spin_samples
+    @spin_samples.setter
+    def spin_samples(self, val):
+        self._numint.spin_samples = val
+    
+    get_veff = get_veff
     reset = rks.RKS.reset
     energy_elec = rks.RKS.energy_elec
-    get_veff = NotImplemented
     nuc_grad_method = NotImplemented
     to_hf = NotImplemented
+    
+    def to_cpu(self):
+        mf = gks.GKS(self.mol)
+        utils.to_cpu(self, out=mf)
+        return mf
diff --git a/gpu4pyscf/dft/mcfun_gpu.py b/gpu4pyscf/dft/mcfun_gpu.py
new file mode 100644
index 000000000..ed010311c
--- /dev/null
+++ b/gpu4pyscf/dft/mcfun_gpu.py
@@ -0,0 +1,380 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+import cupy as cp
+import numpy as np
+from pyscf.dft.LebedevGrid import MakeAngularGrid
+
+MAX_GRIDS_PER_TASK = 4096
+
+def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
+                collinear_threshold=None, collinear_samples=200):
+    '''Multi-collinear effective potential and effective kernel.
+
+    Parameters
+    ----------
+    func : Function to evaluate collinear functionals.
+        The function signature is
+            exc, vxc, fxc, ... = func((rho, s), deriv)
+        The input rho and s have shape
+        * (1, Ngrids) for LDA functionals
+        * (4, Ngrids) for GGA functionals where the four variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * (5, Ngrids) for meta-GGA functionals where the five variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        The returns of func should have the shape
+        * (2, Nvar, Ngrids) for vxc
+        * (2, Nvar, 2, Nvar, Ngrids) for fxc
+        * (2, Nvar, 2, Nvar, 2, Nvar, Ngrids) for kxc
+        The dimension Nvar can be
+        * 1 for LDA
+        * 4 for GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        Note: for GGA functionals the required returns have different conventions
+        to the returns of libxc or xcfun in functional derivatives. Here func
+        needs to return the derivatives to rho, nabla_x(rho), nabla_y(rho),
+        nabla_z(rho) while libxc (or xcfun) returns derivatives to rho, sigma_uu,
+        sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)). Please
+        see example 04-xc_wrapper.py for the transformation between the two
+        conventions.
+    rho_tm : cp array with shape (4, Nvar, Ngrids)
+        rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
+        * 1 for LDA functionals
+        * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+    deriv : int, optional
+        Functional derivatives. The current version (the kernel) supports maximum value 2.
+    spin_samples : int, optional
+        Number of grid points on spherical surface
+    collinear_threshold : float, optional
+        if specified, filters the points of strongly polarized spins and calls the
+        eval_xc_collinear_spin for these points. Recommended value 0.99.
+    collinear_samples : int, optional
+        Number of samples for eval_xc_collinear_spin.
+
+    Returns
+    -------
+    XC functional and derivatives for each grid.
+    * [exc] if deriv = 0
+    * [exc, vxc] if deriv = 1
+    * [exc, vxc, fxc] if deriv = 2
+    '''
+
+    assert deriv < 5
+    rho_tm = cp.asarray(rho_tm)
+    if rho_tm.dtype != cp.double:
+        raise RuntimeError('rho and m must be real')
+    ngrids = rho_tm.shape[-1]
+
+    results = []
+    for p0, p1 in _prange(0, ngrids, MAX_GRIDS_PER_TASK):
+        r = _eval_xc_lebedev(func, rho_tm[...,p0:p1], deriv, spin_samples,
+                             collinear_threshold, collinear_samples)
+        results.append(r)
+
+    return [None if x[0] is None else cp.concatenate(x, axis=-1) for x in zip(*results)]
+
+
+def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
+    '''Multi-collinear functional derivatives for collinear spins
+
+    Parameters
+    ----------
+    func : Function to evaluate collinear functionals.
+        The function signature is
+            exc, vxc, fxc, ... = func((rho, s), deriv)
+        The input rho and s have shape
+        * (1, Ngrids) for LDA functionals
+        * (4, Ngrids) for GGA functionals where the four variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * (5, Ngrids) for meta-GGA functionals where the five variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        The returns of func should have the shape
+        * (2, Nvar, Ngrids) for vxc
+        * (2, Nvar, 2, Nvar, Ngrids) for fxc
+        * (2, Nvar, 2, Nvar, 2, Nvar, Ngrids) for kxc
+        The dimension Nvar can be
+        * 1 for LDA
+        * 4 for GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        Note: for GGA functionals the required returns have different conventions
+        to the returns of libxc or xcfun in functional derivatives. Here func
+        needs to return the derivatives to rho, nabla_x(rho), nabla_y(rho),
+        nabla_z(rho) while libxc (or xcfun) returns derivatives to rho, sigma_uu,
+        sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)).
+    rho_tm : cp array with shape (4, Nvar, Ngrids)
+        rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
+        * 1 for LDA functionals
+        * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+    deriv : int, optional
+        Functional derivatives. The current version (the kernel) supports maximum value 2.
+    spin_samples : int, optional
+        Number of grid points on principal axis
+
+    Returns
+    -------
+    XC functional and derivatives for each grid.
+    * [exc] if deriv = 0
+    * [exc, vxc] if deriv = 1
+    * [exc, vxc, fxc] if deriv = 2
+    '''
+    ngrids = rho_tm.shape[-1]
+    # samples on z=cos(theta) and their weights between [0, 1]
+    sgridz, weights = _make_paxis_samples(spin_samples)
+    blksize = int(cp.ceil(1e5 / ngrids)) * 8
+
+    if rho_tm.ndim == 2:
+        nvar = 1
+    else:
+        nvar = rho_tm.shape[1]
+
+    rho_ts = _project_spin_paxis(rho_tm)
+
+    # TODO: filter s, nabla(s) ~= 0
+    m = rho_tm[1:].reshape(3, nvar, ngrids)
+    s = rho_ts[1].reshape(nvar, ngrids)[0]
+    omega = cp.where(s == 0, 0, m[:, 0] / s)
+
+    xc_orig = func(rho_ts, deriv)
+    exc_eff = xc_orig[0]
+    exc_eff = exc_eff[:,0]
+
+    omega = omega.reshape(3, ngrids)
+    if deriv > 0:
+        vxc = xc_orig[1].reshape(2, nvar, ngrids)
+        vxc_eff = cp.vstack((vxc[:1], cp.einsum('xg,rg->rxg', vxc[1], omega)))
+
+    if deriv > 1:
+        # spin-conserve part
+        fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids)
+        fxc_eff = cp.empty((4, nvar, 4, nvar, ngrids))
+        fxc_eff[0,:,0] = fxc[0,:,0]
+        fz1 = cp.einsum('xyg,rg->rxyg', fxc[1,:,0], omega)
+        fxc_eff[1:,:,0 ] = fz1
+        fxc_eff[0 ,:,1:] = fz1.transpose(1,0,2,3)
+        tmp = cp.einsum('xyg,rg->rxyg', fxc[1,:,1,:], omega)
+        fxc_eff[1:,:,1:] = cp.einsum('rxyg,sg->rxsyg', tmp, omega)
+
+        # spin-flip part
+        fxc_sf = 0
+        for p0, p1 in _prange(0, weights.size, blksize):
+            rho = _project_spin_paxis(rho_tm, sgridz[p0:p1])
+            fxc = func(rho, deriv)[2]
+            fxc = fxc.reshape(2, nvar, 2, nvar, ngrids, p1 - p0)
+            # only considers the xx+yy part
+            fxc_sf += fxc[1,:,1].dot(weights[p0:p1])
+
+        for i in range(1, 4):
+            fxc_eff[i,:,i] += fxc_sf
+        tmp = cp.einsum('xyg,rg->rxyg', fxc_sf, omega)
+        fxc_eff[1:,:,1:] -= cp.einsum('rxyg,sg->rxsyg', tmp, omega)
+
+    ret = [exc_eff]
+    if deriv > 0:
+        ret.append(vxc_eff)
+    if deriv > 1:
+        ret.append(fxc_eff)
+    if deriv > 2:
+        raise NotImplementedError
+    return ret
+
+def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
+                     collinear_threshold=None, collinear_samples=200):
+    '''Multi-collinear effective potential and effective kernel with projection
+    samples on spherical surface (the Lebedev grid samples)
+    '''
+    ngrids = rho_tm.shape[-1]
+    sgrids, weights = _make_sph_samples(spin_samples)
+    sgrids = cp.asarray(sgrids)
+    weights = cp.asarray(weights)
+    blksize = int(cp.ceil(1e4 / ngrids)) * 8
+    # import pdb
+    # pdb.set_trace()
+    if rho_tm.ndim == 2:
+        nvar = 1
+    else:
+        nvar = rho_tm.shape[1]
+    exc_eff = vxc_eff = fxc_eff = kxc_eff = 0
+    for p0, p1 in _prange(0, weights.size, blksize):
+        nsg = p1 - p0
+        p_sgrids = sgrids[p0:p1]
+        p_weights = weights[p0:p1]
+        rho = _project_spin_sph(rho_tm, p_sgrids)
+
+        xc_orig = func(rho, deriv+1)
+
+        exc = xc_orig[0].reshape(ngrids, nsg)
+        vxc = xc_orig[1].reshape(2, nvar, ngrids, nsg)
+
+        rho = rho.reshape(2, nvar, ngrids, nsg)
+        s = rho[1]
+        rho_pure = rho[0,0]
+        exc_rho = exc * rho_pure + cp.einsum('xgo,xgo->go', vxc[1], s)
+        exc_eff += cp.einsum('go,o->g', exc_rho, p_weights)
+
+        if deriv > 0:
+            fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids, nsg)
+            # vs * 2 + s*f_s_st
+            vxc[1] *= 2
+            vxc += cp.einsum('xbygo,xgo->bygo', fxc[1], s)
+            c_tm = _ts2tm_transformation(p_sgrids)
+            cw_tm = c_tm * p_weights
+            vxc_eff += cp.einsum('rao,axgo->rxg', cw_tm, vxc)
+
+        if deriv > 1:
+            kxc = xc_orig[3].reshape(2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
+            fxc[1,:,1] *= 3
+            fxc[0,:,1] *= 2
+            fxc[1,:,0] *= 2
+            fxc += cp.einsum('xbyczgo,xgo->byczgo', kxc[1], s)
+            fxc = cp.einsum('rao,axbygo->rxbygo', c_tm, fxc)
+            fxc_eff += cp.einsum('sbo,rxbygo->rxsyg', cw_tm, fxc)
+
+        if deriv > 2:
+            lxc = xc_orig[4].reshape(2, nvar, 2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
+            kxc[1,:,1,:,1] *= 4
+            kxc[1,:,1,:,0] *= 3
+            kxc[1,:,0,:,1] *= 3
+            kxc[0,:,1,:,1] *= 3
+            kxc[1,:,0,:,0] *= 2
+            kxc[0,:,1,:,0] *= 2
+            kxc[0,:,0,:,1] *= 2
+
+            kxc += cp.einsum('wbxcydzgo,wgo->bxcydzgo', lxc[1], s)
+            kxc = cp.einsum('rao,axbyczgo->rxbyczgo', c_tm, kxc)
+            kxc = cp.einsum('sbo,rxbyczgo->rxsyczgo', c_tm, kxc)
+            # kxc = cp.einsum('rao,sbo,axbyczgo->rxsyczgo', c_tm, c_tm,kxc)
+            kxc_eff += cp.einsum('tco,rxsyczgo->rxsytzg', cw_tm, kxc)
+
+    # exc in libxc is defined as Exc per particle. exc_eff calculated above is exc*rho.
+    # Divide exc_eff by rho so as to follow the convention of libxc
+    if rho_tm.ndim == 2:
+        rho_pure = rho_tm[0]
+    else:
+        rho_pure = rho_tm[0,0]
+
+    exc_eff[rho_pure == 0] = 0
+    exc_eff[rho_pure != 0] /= rho_pure[rho_pure != 0]
+
+    # Strongly spin-polarized points (rho ~= |m|) can be considered as collinear spins
+    if collinear_threshold is not None:
+        rho, s = _project_spin_paxis(rho_tm.reshape(4, nvar, ngrids)[:,0])
+        cs_idx = cp.where(s >= rho * collinear_threshold)[0]
+        if cs_idx.size > 0:
+            xc_cs = eval_xc_collinear_spin(func, rho_tm[...,cs_idx], deriv,
+                                           collinear_samples)
+            print("debug 1")
+            print("exc_eff", exc_eff.shape)
+            print("cs_idx", cs_idx.shape)
+            print("xc_cs", len(xc_cs))
+            print("xc_cs[0]", xc_cs[0].shape)
+            print("rho_tm", rho_tm.shape)
+            exc_eff[...,cs_idx] = xc_cs[0]
+            if deriv > 0:
+                vxc_eff[...,cs_idx] = xc_cs[1]
+            if deriv > 1:
+                fxc_eff[...,cs_idx] = xc_cs[2]
+            if deriv > 2:
+                kxc_eff[...,cs_idx] = xc_cs[3]
+
+    ret = [exc_eff]
+    if deriv > 0:
+        ret.append(vxc_eff)
+    if deriv > 1:
+        ret.append(fxc_eff)
+    if deriv > 2:
+        ret.append(kxc_eff)
+    if deriv > 3:
+        raise NotImplementedError
+    return ret
+
+def _make_sph_samples(spin_samples):
+    '''Integration samples on spherical surface'''
+    ang_grids = MakeAngularGrid(spin_samples)
+    directions = ang_grids[:,:3].copy(order='F')
+    weights = ang_grids[:,3].copy()
+    return directions, weights
+
+def _prange(start, end, step):
+    '''Partitions range into segments: i0:i1, i1:i2, i2:i3, ...'''
+    if start < end:
+        for i in range(start, end, step):
+            yield i, min(i+step, end)
+
+def _project_spin_sph(rho_tm, sgrids):
+    '''Projects spin onto spherical surface'''
+    rho = rho_tm[0]
+    m = rho_tm[1:]
+    nsg = sgrids.shape[0]
+    ngrids = rho.shape[-1]
+    if rho_tm.ndim == 2:
+        rho_ts = cp.empty((2, ngrids, nsg))
+        rho_ts[0] = rho[:, cp.newaxis]
+        rho_ts[1] = cp.einsum('mg,om->go', m, sgrids)
+        rho_ts = rho_ts.reshape(2, ngrids*nsg)
+    else:
+        nvar = rho_tm.shape[1]
+        rho_ts = cp.empty((2, nvar, ngrids, nsg))
+        rho_ts[0] = rho[:, :, cp.newaxis]
+        rho_ts[1] = cp.einsum('mxg,om->xgo', m, sgrids)
+        rho_ts = rho_ts.reshape(2, nvar, ngrids*nsg)
+    return rho_ts
+
+def _ts2tm_transformation(sgrids):
+    '''
+    Transformation that projects v_ts(rho,s) to rho/m representation (rho,mx,my,mz)
+    '''
+    nsg = sgrids.shape[0]
+    c_tm = cp.zeros((4, 2, nsg))
+    c_tm[0,0] = 1
+    c_tm[1:,1] = sgrids.T
+    return c_tm
+
+def _make_paxis_samples(spin_samples):
+    '''Samples on principal axis between [0, 1]'''
+    rt, wt = np.polynomial.legendre.leggauss(spin_samples)
+    rt = cp.asarray(rt)
+    wt = cp.asarray(wt)
+    rt = rt * .5 + .5
+    wt *= .5  # normalized to 1
+    return rt, wt
+
+def _project_spin_paxis(rho_tm, sgridz=None):
+    '''Projects spins onto the principal axis'''
+    rho = rho_tm[0]
+    m = rho_tm[1:]
+
+    s = cp.linalg.norm(m, axis=0)
+    if sgridz is None:
+        rho_ts = cp.stack([rho, s])
+    else:
+        ngrids = rho.shape[-1]
+        nsg = sgridz.shape[0]
+        if rho_tm.ndim == 2:
+            rho_ts = cp.empty((2, ngrids, nsg))
+            rho_ts[0] = rho[:,cp.newaxis]
+            rho_ts[1] = s[:,cp.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, ngrids * nsg)
+        else:
+            print('222')
+            nvar = rho_tm.shape[1]
+            rho_ts = cp.empty((2, nvar, ngrids, nsg))
+            rho_ts[0] = rho[:,:,cp.newaxis]
+            rho_ts[1] = s[:,:,cp.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
+    return rho_ts
+
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 55c25dfab..81e1be013 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -2429,6 +2429,74 @@ def unsort_orbitals(self, sorted_mat, axis=[], out=None):
         out[tuple(fancy_index)] = sorted_mat
         return out
 
+    def sort_orbitals_2c(self, mat, axis=[]):
+        ''' Transform given axis of a 2-component matrix (GKS) into sorted AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao,
+        representing alpha (:nao) and beta (nao:) components. Both components
+        are sorted using the same AO sorting index.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # Create the 2-component sorting index:
+        # [sorted_alpha_indices, sorted_beta_indices]
+        # E.g., if idx = [0, 2, 1] (nao=3),
+        # idx_2c = [0, 2, 1, 3+0, 3+2, 3+1] = [0, 2, 1, 3, 5, 4]
+        # We use numpy to build the index, consistent with self._ao_idx
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component sorting")
+                indices = idx_2c
+            else:
+                # Use cupy.arange for non-sorted axes, as in the original sort_orbitals
+                indices = cupy.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        # Perform the sorting using advanced indexing
+        return mat[tuple(fancy_index)]
+
+    def unsort_orbitals_2c(self, sorted_mat, axis=[], out=None):
+        ''' Transform given axis of a 2-component matrix from sorted AO to original AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao.
+        This is the inverse operation of sort_orbitals_2c.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # The 2-component index is created identically to sort_orbitals_2c
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * sorted_mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(sorted_mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component unsorting")
+                indices = idx_2c
+            else:
+                indices = cupy.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        if out is None:
+            out = cupy.empty_like(sorted_mat)
+            
+        # Perform the unsorting assignment
+        out[tuple(fancy_index)] = sorted_mat
+        return out
+
 class GTOValEnvVars(ctypes.Structure):
     _fields_ = [
         ("natm", ctypes.c_int),
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
new file mode 100644
index 000000000..cdc78e087
--- /dev/null
+++ b/gpu4pyscf/dft/numint2c.py
@@ -0,0 +1,499 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Numerical integration functions for (2-component) GKS with real AO basis
+'''
+
+import functools
+import numpy as np
+import cupy as cp
+from pyscf import lib
+from pyscf.dft import numint2c
+from gpu4pyscf.dft import numint, mcfun_gpu
+from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao
+from gpu4pyscf.dft import xc_deriv
+from gpu4pyscf.lib import utils
+from pyscf import __config__
+
+
+def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
+             with_lapl=True, verbose=None):
+    nao = ao.shape[-2]
+    assert dm.ndim == 2 and nao * 2 == dm.shape[0]
+    if not isinstance(dm, cp.ndarray):
+        dm = cp.asarray(dm)
+
+    nao, ngrids = ao.shape[-2:]
+    xctype = xctype.upper()
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc
+
+    if xctype == 'LDA':
+        c0a = _dot_ao_dm(mol, ao, dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao, dm[nao:], non0tab, shls_slice, ao_loc)
+        rho_m = _contract_rho_m((ao, ao), (c0a, c0b), hermi, True)
+    elif xctype == 'GGA':
+        # first 4 ~ (rho, m), second 4 ~ (0th order, dx, dy, dz)
+        if hermi:
+            rho_m = cp.empty((4, 4, ngrids))
+        else:
+            rho_m = cp.empty((4, 4, ngrids), dtype=cp.complex128)
+        c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
+        c0 = (c0a, c0b)
+        rho_m[:,0] = _contract_rho_m((ao[0], ao[0]), c0, hermi, True)
+        for i in range(1, 4):
+            rho_m[:,i] = _contract_rho_m((ao[i], ao[i]), c0, hermi, False)
+        if hermi:
+            rho_m[:,1:4] *= 2  # *2 for |ao> dm < dx ao| + |dx ao> dm < ao|
+        else:
+            for i in range(1, 4):
+                c1a = _dot_ao_dm(mol, ao[i], dm[:nao], non0tab, shls_slice, ao_loc)
+                c1b = _dot_ao_dm(mol, ao[i], dm[nao:], non0tab, shls_slice, ao_loc)
+                rho_m[:,i] += _contract_rho_m((ao[0], ao[0]), (c1a, c1b), hermi, False)
+    else: # meta-GGA
+        if hermi:
+            dtype = cp.double
+        else:
+            dtype = cp.complex128
+        if with_lapl:
+            rho_m = cp.empty((4, 6, ngrids), dtype=dtype)
+            tau_idx = 5
+        else:
+            rho_m = cp.empty((4, 5, ngrids), dtype=dtype)
+            tau_idx = 4
+        c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
+        c0 = (c0a, c0b)
+        rho_m[:,0] = _contract_rho_m((ao[0], ao[0]), c0, hermi, True)
+        rho_m[:,tau_idx] = 0
+        for i in range(1, 4):
+            c1a = _dot_ao_dm(mol, ao[i], dm[:nao], non0tab, shls_slice, ao_loc)
+            c1b = _dot_ao_dm(mol, ao[i], dm[nao:], non0tab, shls_slice, ao_loc)
+            rho_m[:,tau_idx] += _contract_rho_m((ao[i], ao[i]), (c1a, c1b), hermi, True)
+
+            rho_m[:,i] = _contract_rho_m((ao[i], ao[i]), c0, hermi, False)
+            if hermi:
+                rho_m[:,i] *= 2
+            else:
+                rho_m[:,i] += _contract_rho_m((ao[0], ao[0]), (c1a, c1b), hermi, False)
+        if with_lapl:
+            # TODO: rho_m[:,4] = \nabla^2 rho
+            raise NotImplementedError
+        # tau = 1/2 (\nabla f)^2
+        rho_m[:,tau_idx] *= .5
+    return rho_m
+
+def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
+                  max_memory=2000, verbose=None):
+    xctype = ni._xc_type(xc_code)
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc_nr()
+    n2c = dms[0].shape[-1]
+    nao = n2c // 2
+    opt = ni.gdftopt
+    _sorted_mol = opt._sorted_mol
+
+    nelec = 0
+    excsum = 0
+    vmat = cp.zeros((n2c,n2c), dtype=cp.complex128)
+
+    if xctype in ('LDA', 'GGA', 'MGGA'):
+        f_eval_mat = {
+            ('LDA' , 'n'): (_ncol_lda_vxc_mat , 0),
+            ('LDA' , 'm'): (_mcol_lda_vxc_mat , 0),
+            ('GGA' , 'm'): (_mcol_gga_vxc_mat , 1),
+            ('MGGA', 'm'): (_mcol_mgga_vxc_mat, 1),
+        }
+        fmat, ao_deriv = f_eval_mat[(xctype, ni.collinear[0])]
+
+        if ni.collinear[0] == 'm':  # mcol
+            eval_xc = ni.mcfun_eval_xc_adapter(xc_code)
+        else:
+            raise NotImplementedError('locally-collinear vxc is not implemented')
+
+        for ao, mask, weight, coords \
+                in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory):
+            mask_2c = np.concatenate([mask, mask + nao])
+            dm_mask = dms[mask_2c[:,None],mask_2c]
+            rho = eval_rho(_sorted_mol, ao, dm_mask, non0tab=None, xctype=xctype, hermi=hermi,
+                    with_lapl=False, verbose=None)
+            exc, vxc = eval_xc(xc_code, rho, deriv=1, xctype=xctype)[:2]
+            if xctype == 'LDA':
+                den = rho[0] * weight
+            else:
+                den = rho[0,0] * weight
+            nelec += den.sum()
+            excsum += cp.dot(den, exc)
+            vmat += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
+                            ao_loc, hermi)
+
+    elif xctype == 'HF':
+        pass
+    else:
+        raise NotImplementedError(f'numint2c.get_vxc for functional {xc_code}')
+
+    if hermi:
+        vmat = vmat + vmat.conj().transpose(1,0)
+
+    return nelec, excsum, vmat
+
+def _gks_mcol_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
+                  rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    raise NotImplementedError('non-collinear lda fxc')
+
+def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of non-collinear LDA'''
+    # NOTE vxc in u/d representation
+    raise NotImplementedError('non-collinear lda vxc mat')
+
+
+# * Mcfun requires functional derivatives to total-density and spin-density.
+# * Make it a global function than a closure so as to be callable by multiprocessing
+def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
+    t, s = rho
+    if not isinstance(t, cp.ndarray):
+        t = cp.asarray(t)
+    if not isinstance(s, cp.ndarray):
+        s = cp.asarray(s)
+    rho = cp.stack([(t + s) * .5, (t - s) * .5])
+    spin = 1
+    evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype, spin=spin)
+    evfk = list(evfk)
+    for order in range(1, deriv+1):
+        if evfk[order] is not None:
+            evfk[order] = xc_deriv.ud2ts(evfk[order])
+    return evfk
+
+def mcfun_eval_xc_adapter(ni, xc_code):
+    '''Wrapper to generate the eval_xc function required by mcfun'''
+
+    xctype = ni._xc_type(xc_code)
+    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype)
+    def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None,
+                    verbose=None, spin=None):
+        return mcfun_gpu.eval_xc_eff(
+            fn_eval_xc, rho, deriv, spin_samples=ni.spin_samples,
+            collinear_threshold=ni.collinear_thrd,
+            collinear_samples=ni.collinear_samples)
+    return eval_xc_eff
+
+def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear LDA'''
+    wv = weight * vxc
+    if hermi:
+        wv *= .5  # * .5 because of v+v.conj().T in r_vxc
+    wr, wmx, wmy, wmz = wv
+
+    # einsum('g,g,xgi,xgj->ij', vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sx, vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sy, vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sz, vxc, weight, ao, ao)
+    aow = None
+    aow = _scale_ao(ao, wmx[0], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, wmy[0], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    if hermi:
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = cp.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+    tmpx = tmpy = None
+    aow = _scale_ao(ao, wr[0]+wmz[0], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, wr[0]-wmz[0], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
+    return mat
+
+def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear LDA'''
+    wv = weight * vxc
+    if hermi:
+        wv[:,0] *= .5  # * .5 because of v+v.conj().T in r_vxc
+    wr, wmx, wmy, wmz = wv
+
+    aow = None
+    aow = _scale_ao(ao[:4], wr[:4]+wmz[:4], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wr[:4]-wmz[:4], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmx[:4], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmy[:4], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    if hermi:
+        assert vxc.dtype == cp.double
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = cp.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
+        tmpx += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], wmy[1:4].conj(), out=aow)  # My
+        tmpy += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+        aow = _scale_ao(ao[1:4], (wr[1:4]+wmz[1:4]).conj(), out=aow)  # Mz
+        mataa += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
+        matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
+    return mat
+
+def _tau_dot(mol, bra, ket, wv, mask, shls_slice, ao_loc):
+    '''nabla_ao dot nabla_ao
+    numpy.einsum('p,xpi,xpj->ij', wv, bra[1:4].conj(), ket[1:4])
+    '''
+    aow = _scale_ao(ket[1], wv)
+    mat = _dot_ao_ao(mol, bra[1], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ket[2], wv, aow)
+    mat += _dot_ao_ao(mol, bra[2], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ket[3], wv, aow)
+    mat += _dot_ao_ao(mol, bra[3], aow, mask, shls_slice, ao_loc)
+    return mat
+
+def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear MGGA'''
+    wv = weight * vxc
+    tau_idx = 4
+    wv[:,tau_idx] *= .5  # *.5 for 1/2 in tau
+    if hermi:
+        wv[:,0] *= .5  # * .5 because of v+v.conj().T in r_vxc
+        wv[:,tau_idx] *= .5
+    wr, wmx, wmy, wmz = wv
+
+    aow = None
+    aow = _scale_ao(ao[:4], wr[:4]+wmz[:4], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    mataa += _tau_dot(mol, ao, ao, wr[tau_idx]+wmz[tau_idx], mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wr[:4]-wmz[:4], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    matbb += _tau_dot(mol, ao, ao, wr[tau_idx]-wmz[tau_idx], mask, shls_slice, ao_loc)
+
+    aow = _scale_ao(ao[:4], wmx[:4], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    tmpx += _tau_dot(mol, ao, ao, wmx[tau_idx], mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmy[:4], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    tmpy += _tau_dot(mol, ao, ao, wmy[tau_idx], mask, shls_slice, ao_loc)
+    if hermi:
+        assert vxc.dtype == cp.double
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = cp.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
+        tmpx += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], wmy[1:4].conj(), out=aow)  # My
+        tmpy += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+        aow = _scale_ao(ao[1:4], (wr[1:4]+wmz[1:4]).conj(), out=aow)  # Mz
+        mataa += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
+        matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
+    return mat
+
+def _mcol_lda_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                      mask, shls_slice, ao_loc, hermi):
+    raise NotImplementedError('non-collinear lda fxc')
+
+def _mcol_gga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                      mask, shls_slice, ao_loc, hermi):
+    raise NotImplementedError('non-collinear gga fxc')
+
+def _mcol_mgga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                       mask, shls_slice, ao_loc, hermi):
+    raise NotImplementedError('non-collinear mgga fxc')
+
+def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
+    '''
+    hermi indicates whether the density matrix is hermitian.
+    bra_eq_ket indicates whether bra and ket basis are the same AOs.
+    '''
+    # rho = einsum('xgi,ij,xgj->g', ket, dm, bra.conj())
+    # mx = einsum('xy,ygi,ij,xgj->g', sx, ket, dm, bra.conj())
+    # my = einsum('xy,ygi,ij,xgj->g', sy, ket, dm, bra.conj())
+    # mz = einsum('xy,ygi,ij,xgj->g', sz, ket, dm, bra.conj())
+    ket_a, ket_b = ket
+    bra_a, bra_b = bra
+    nao = min(ket_a.shape[-2], bra_a.shape[-2])
+    ngrids = ket_a.shape[-1]
+    if hermi:
+        raa = cp.einsum('ip,ip->p', bra_a.real, ket_a[:nao].real)
+        raa+= cp.einsum('ip,ip->p', bra_a.imag, ket_a[:nao].imag)
+        rab = cp.einsum('ip,ip->p', bra_a.conj(), ket_b[:nao])
+        rbb = cp.einsum('ip,ip->p', bra_b.real, ket_b[nao:].real)
+        rbb+= cp.einsum('ip,ip->p', bra_b.imag, ket_b[nao:].imag)
+        rho_m = cp.empty((4, ngrids))
+        rho_m[0,:] = raa + rbb     # rho
+        rho_m[1,:] = rab.real      # mx
+        rho_m[2,:] = rab.imag      # my
+        rho_m[3,:] = raa - rbb     # mz
+        if bra_eq_ket:
+            rho_m[1,:] *= 2
+            rho_m[2,:] *= 2
+        else:
+            rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
+            rho_m[1,:] += rba.real
+            rho_m[2,:] -= rba.imag
+    else:
+        raa = cp.einsum('ip,ip->p', bra_a.conj(), ket_a[:nao])
+        rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
+        rab = cp.einsum('ip,ip->p', bra_a.conj(), ket_b[:nao])
+        rbb = cp.einsum('ip,ip->p', bra_b.conj(), ket_b[nao:])
+        rho_m = cp.empty((4, ngrids), dtype=cp.complex128)
+        rho_m[0,:] = raa + rbb         # rho
+        rho_m[1,:] = rab + rba         # mx
+        rho_m[2,:] = (rba - rab) * 1j  # my
+        rho_m[3,:] = raa - rbb         # mz
+    return rho_m
+
+
+class NumInt2C(lib.StreamObject, numint.LibXCMixin):
+    '''Numerical integration methods for 2-component basis (used by GKS)'''
+    _keys = {'gdftopt'}
+    to_gpu = utils.to_gpu
+    device = utils.device
+
+    gdftopt      = None
+
+    # collinear schemes:
+    #   'col' (collinear, by default)
+    #   'ncol' (non-collinear, also known as locally collinear)
+    #   'mcol' (multi-collinear)
+    collinear = getattr(__config__, 'dft_numint_RnumInt_collinear', 'col')
+    spin_samples = getattr(__config__, 'dft_numint_RnumInt_spin_samples', 770)
+    collinear_thrd = getattr(__config__, 'dft_numint_RnumInt_collinear_thrd', 0.99)
+    collinear_samples = getattr(__config__, 'dft_numint_RnumInt_collinear_samples', 200)
+
+    eval_ao = staticmethod(numint.eval_ao)
+    eval_rho = staticmethod(eval_rho)
+
+    def build(self, mol, coords):
+        self.gdftopt = numint._GDFTOpt.from_mol(mol)
+        self.grid_blksize = None
+        self.non0ao_idx = {}
+        return self
+
+    def eval_rho1(self, mol, ao, dm, screen_index=None, xctype='LDA', hermi=0,
+                  with_lapl=True, cutoff=None, ao_cutoff=None, pair_mask=None,
+                  verbose=None):
+        return self.eval_rho(mol, ao, dm, screen_index, xctype, hermi,
+                             with_lapl, verbose=verbose)
+
+    def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
+                  with_lapl=True, verbose=None):
+        '''Calculate the electron density for LDA functional and the density
+        derivatives for GGA functional in the framework of 2-component basis.
+        '''
+        if not isinstance(mo_occ, cp.ndarray):
+            mo_occ = cp.asarray(mo_occ)
+        if not isinstance(mo_coeff, cp.ndarray):
+            mo_coeff = cp.asarray(mo_coeff)
+        if self.collinear[0] in ('n', 'm'):
+            # TODO:
+            dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+            hermi = 1
+            rho = self.eval_rho(mol, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
+            return rho
+
+        raise NotImplementedError(self.collinear)
+
+    def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
+                        max_memory=2000):
+        '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
+        DFT hessian module etc.
+        '''
+        raise NotImplementedError("Kxc calculation is not supported.")
+
+    def get_rho(self, mol, dm, grids, max_memory=2000):
+        '''Density in real space
+        '''
+        nao = dm.shape[-1] // 2
+        dm_a = dm[:nao,:nao].real
+        dm_b = dm[nao:,nao:].real
+        ni = self._to_numint1c()
+        return ni.get_rho(mol, dm_a+dm_b, grids, max_memory)
+
+    _gks_mcol_vxc = _gks_mcol_vxc
+    _gks_mcol_fxc = _gks_mcol_fxc
+
+    @lib.with_doc(numint.nr_rks.__doc__)
+    def nr_vxc(self, mol, grids, xc_code, dms, relativity=0, hermi=1,
+               max_memory=2000, verbose=None):
+        if not isinstance(dms, cp.ndarray):
+            dms = cp.asarray(dms)
+        if self.collinear[0] in ('m',):  # mcol or ncol
+            opt = getattr(self, 'gdftopt', None)
+            if opt is None:
+                self.build(mol, grids.coords)
+                opt = self.gdftopt
+            assert dms.ndim == 2
+            dms = cp.asarray(dms)
+            dms = opt.sort_orbitals_2c(dms, axis=[0,1])
+            n, exc, vmat = self._gks_mcol_vxc(mol, grids, xc_code, dms, relativity,
+                                              hermi, max_memory, verbose)
+            vmat = opt.unsort_orbitals_2c(vmat, axis=[0,1])
+        else:
+            raise NotImplementedError("Locally collinear and collinear is not implemented")
+        return n.sum(), exc, vmat
+    get_vxc = nr_gks_vxc = nr_vxc
+
+    @lib.with_doc(numint.nr_nlc_vxc.__doc__)
+    def nr_nlc_vxc(self, mol, grids, xc_code, dm, spin=0, relativity=0, hermi=1,
+                   max_memory=2000, verbose=None):
+        raise NotImplementedError('non-collinear nlc vxc')
+
+    @lib.with_doc(numint.nr_rks_fxc.__doc__)
+    def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
+               rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+        raise NotImplementedError('non-collinear fxc')
+    get_fxc = nr_gks_fxc = nr_fxc
+    
+    def _init_xcfuns(self, xc_code, spin=0):
+        return numint._init_xcfuns(xc_code, spin)
+    eval_xc_eff = numint.eval_xc_eff
+    mcfun_eval_xc_adapter = mcfun_eval_xc_adapter
+
+    block_loop = numint.NumInt.block_loop
+    reset = numint.NumInt.reset
+
+    def _to_numint1c(self):
+        '''Converts to the associated class to handle collinear systems'''
+        return self.view(numint.NumInt)
+    
+    def to_cpu(self):
+        ni = numint2c.NumInt2C()
+        return ni
\ No newline at end of file
diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
new file mode 100644
index 000000000..74a999f93
--- /dev/null
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -0,0 +1,147 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from pyscf import gto
+from pyscf import lib
+from gpu4pyscf.dft import gks
+from pyscf.dft import gks as gks_cpu
+try:
+    import mcfun
+except ImportError:
+    mcfun = None
+
+
+def setUpModule():
+    global mol, mol1
+    mol = gto.Mole()
+    mol.atom = '''
+        O    0    0    0
+        H    0.   -0.757   0.587
+        H    0.   0.757    0.587'''
+    mol.spin = None
+    mol.basis = 'sto3g'
+    mol.output = '/dev/null'
+    mol.build()
+
+    mol1 = gto.M(
+        atom = '''
+            O    0    0    0
+            H    0.   -0.757   0.587
+            H    0.   0.757    0.587''',
+        charge = 1,
+        spin = 1,
+        basis = 'sto3g',
+        output = '/dev/null'
+        )
+
+
+def tearDownModule():
+    global mol, mol1
+    mol.stdout.close()
+    mol1.stdout.close()
+    del mol, mol1
+
+
+class KnownValues(unittest.TestCase):
+    def test_mcol_gks_lda(self):
+        
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'lda,'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.0600297733097, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.421986983504258, 5)
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'lda,vwn'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
+
+    def test_mcol_gks_gga(self):
+
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'pbe'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -75.2256398121708, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.81184613393452, 5)
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'pbe'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.869954771937, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.92164954706164, 5) # pyscf result
+
+    def test_mcol_gks_hyb(self):
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'b3lyp'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -75.312587317089, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.2469582128507, 5) # pyscf result
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'b3lyp'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.9528036305753, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -28.49145406025193, 5) # pyscf result
+
+    def test_mcol_gks_mgga(self):
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'm06l'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -75.3053691716776, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.03099891671804, 5) # pyscf result
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'm06l'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.9468853267496, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -28.188215296679516, 5) # pyscf result
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_to_cpu(self):
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'lda,vwn'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+
+        mf_cpu = mf_gpu.to_cpu()
+        eks4_cpu = mf_cpu.kernel()
+
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu, 6)
+        self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
+
+
+if __name__ == "__main__":
+    print("Test GKS")
+    unittest.main()
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
new file mode 100644
index 000000000..4eb5ff678
--- /dev/null
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -0,0 +1,392 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import pyscf
+import cupy
+from pyscf import lib, scf
+from pyscf.dft.numint2c import NumInt2C as pyscf_numint2c
+from pyscf.dft.numint import NumInt as pyscf_numint
+from gpu4pyscf.dft import Grids
+from gpu4pyscf.dft import numint2c
+from gpu4pyscf.dft import numint
+from pyscf.dft import numint2c as pyscf_numint2c_file
+from gpu4pyscf.dft.numint2c import NumInt2C
+from gpu4pyscf.dft.numint import NumInt
+from gpu4pyscf import dft
+from gpu4pyscf.dft import gen_grid
+try:
+    import mcfun
+except ImportError:
+    mcfun = None
+
+def setUpModule():
+    global mol, grids_cpu, grids_gpu, dm, dm0, dm1, mo_occ, mo_coeff
+    mol = pyscf.M(
+        atom = '''
+O        0.000000    0.000000    0.117790
+H        0.000000    0.755453   -0.471161
+H        0.000000   -0.755453   -0.471161''',
+        basis = 'ccpvdz',
+        charge = 1,
+        spin = 1,  # = 2S = spin_up - spin_down
+        output = '/dev/null'
+        )
+
+    np.random.seed(2)
+    mf = scf.GHF(mol)
+    mf.kernel()
+    dm1 = mf.make_rdm1().copy()
+    dm = dm1
+    mo_coeff = mf.mo_coeff
+    mo_occ = mf.mo_occ
+    dm0 = (mo_coeff*mo_occ).dot(mo_coeff.T)
+
+    grids_gpu = Grids(mol)
+    grids_gpu.level = 1
+    grids_gpu.build()
+
+    grids_cpu = grids_gpu.to_cpu()
+    grids_cpu.weights = cupy.asnumpy(grids_gpu.weights)
+    grids_cpu.coords = cupy.asnumpy(grids_gpu.coords)
+
+def tearDownModule():
+    global mol, grids_cpu, grids_gpu
+    mol.stdout.close()
+    del mol, grids_cpu, grids_gpu
+
+LDA = 'LDA_C_VWN'
+GGA_PBE = 'GGA_C_PBE'
+MGGA_M06 = 'MGGA_C_M06'
+
+class KnownValues(unittest.TestCase):
+
+    def test_eval_rho(self):
+        np.random.seed(1)
+        dm = np.random.random(dm0.shape) + np.random.random(dm0.shape)*1.0j
+        dm = dm + dm.conj().T
+        ni_gpu = NumInt2C()
+        ni_cpu = pyscf_numint2c()
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_eval_rho2(self):
+        np.random.seed(1)
+        mo_coeff_test = np.random.random(mo_coeff.shape) + np.random.random(mo_coeff.shape)*1.0j
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+        ni_cpu = pyscf_numint2c()
+        ni_cpu.collinear='m'
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            
+            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_get_rho(self):
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+
+        np.random.seed(1)
+        ni_gpu_1c = NumInt()
+        dm_test = np.random.random(dm0.shape) + np.random.random(dm0.shape)*1.0j
+        dm_test = dm_test + dm_test.T.conj()
+
+        n2c = dm_test.shape[0]
+        nao = n2c//2
+        dm_1c_test = dm_test[:nao,:nao] + dm_test[nao:,nao:]
+        rho_gpu = ni_gpu.get_rho(mol, dm_test, grids_gpu)
+        rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test.real, grids_gpu)
+        self.assertAlmostEqual(abs(rho_gpu.get() - rho_1c_gpu.get()).max(), 0, 10)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_eval_xc_eff(self):
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+        ni_cpu = pyscf_numint2c()
+        ni_cpu.collinear='m'
+        np.random.seed(1)
+        dm = dm0*1.0 + dm0 * 0.1j
+        dm = dm + dm.T.conj()
+        for xc_code in (LDA, GGA_PBE, MGGA_M06):
+            n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
+            n_cpu, exc_cpu, vmat_cpu = ni_cpu.nr_vxc(mol, grids_cpu, xc_code, dm)
+            self.assertAlmostEqual(abs(n_gpu.get() - n_cpu).max(), 0, 10)
+            self.assertAlmostEqual(abs(exc_gpu.get() - exc_cpu).max(), 0, 10)
+            self.assertAlmostEqual(abs(vmat_gpu.get() - vmat_cpu).max(), 0, 10)
+
+    def test_eval_xc_eff_fp(self):
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+        np.random.seed(1)
+        dm = dm0*1.0 + dm0 * 0.1j
+        dm = dm + dm.T.conj()
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, LDA, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -1.310501342423071).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.20448306536588537-6.75460139752253e-21j)).max(), 0, 10)
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, GGA_PBE, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -0.7237150857425112).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.05446425800187435-4.486282070082083e-21j)).max(), 0, 10)
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, MGGA_M06, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -0.7703982586705045).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.18688247306409317+7.50400133342109e-20j)).max(), 0, 10)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_mcol_lda_vxc_mat(self):
+        xc_code = 'lda,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+    def test_mcol_lda_vxc_mat_fp(self):
+        xc_code = 'lda,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
+            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
+            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_mcol_gga_vxc_mat(self):
+        xc_code = 'pbe,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+    def test_mcol_gga_vxc_mat_fp(self):
+        xc_code = 'pbe,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
+            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
+            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_mcol_mgga_vxc_mat(self):
+        xc_code = 'tpss'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+    def test_mcol_mgga_vxc_mat_fp(self):
+        xc_code = 'tpss'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) 
+            - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) 
+            - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
+
+
+if __name__ == "__main__":
+    print("Full Tests for dft numint2c")
+    unittest.main()
+
diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py
index cd8336d7f..a2fcc5613 100644
--- a/gpu4pyscf/scf/ghf.py
+++ b/gpu4pyscf/scf/ghf.py
@@ -22,6 +22,15 @@
 from gpu4pyscf.lib.cupy_helper import asarray, return_cupy_array
 from gpu4pyscf.lib import utils
 
+def _from_rhf_init_dm(dma, breaksym=True):
+    dma = dma * .5
+    dm = block_diag(dma, dma)
+    if breaksym:
+        nao = dma.shape[0]
+        idx, idy = cp.diag_indices(nao)
+        dm[idx+nao,idy] = dm[idx,idy+nao] = dma.diagonal() * .05
+    return dm
+
 class GHF(hf.SCF):
     to_gpu = utils.to_gpu
     device = utils.device
@@ -55,7 +64,7 @@ class GHF(hf.SCF):
 
     def get_init_guess(self, mol=None, key='minao', **kwargs):
         dma = hf.RHF.get_init_guess(self, mol, key, **kwargs)
-        return block_diag(dma, dma)
+        return _from_rhf_init_dm(dma)
 
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
@@ -85,6 +94,7 @@ def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True,
         return vj, vk
 
     def get_j(self, mol=None, dm=None, hermi=1, omega=None):
+        assert hermi == 1, 'hermi must be 1'
         dm = asarray(dm)
         dm_shape = dm.shape
         nso = dm.shape[-1]
@@ -92,8 +102,8 @@ def get_j(self, mol=None, dm=None, hermi=1, omega=None):
         dm = dm.reshape(-1,nso,nso)
         n_dm = dm.shape[0]
         dm = dm[:,:nao,:nao] + dm[:,nao:,nao:]
-        jtmp = hf.SCF.get_j(self, mol, dm, hermi, omega)
-        vj = cp.zeros((n_dm,nso,nso))
+        jtmp = hf.SCF.get_j(self, mol, dm.real, hermi, omega)
+        vj = cp.zeros((n_dm,nso,nso), dtype=dm.dtype)
         vj[:,:nao,:nao] = vj[:,nao:,nao:] = jtmp
         return vj.reshape(dm_shape)
 
@@ -108,14 +118,27 @@ def get_k(self, mol=None, dm=None, hermi=1, omega=None):
         dmbb = dm[:,nao:,nao:]
         dmab = dm[:,:nao,nao:]
         dmba = dm[:,nao:,:nao]
-        dm = cp.vstack((dmaa, dmbb, dmab, dmba))
-        ktmp = super().get_k(mol, dm, hermi=0, omega=omega)
-        ktmp = ktmp.reshape(4,n_dm,nao,nao)
-        vk = cp.zeros((n_dm,nso,nso), dm.dtype)
-        vk[:,:nao,:nao] = ktmp[0]
-        vk[:,nao:,nao:] = ktmp[1]
-        vk[:,:nao,nao:] = ktmp[2]
-        vk[:,nao:,:nao] = ktmp[3]
+        if dm.dtype == cp.complex128:
+            dm_real = cp.vstack((dmaa.real, dmbb.real, dmab.real, dmba.real))
+            ktmp_real = super().get_k(mol, dm_real, hermi=0, omega=omega)
+            ktmp_real = ktmp_real.reshape(4,n_dm,nao,nao)
+            dm_imag = cp.vstack((dmaa.imag, dmbb.imag, dmab.imag, dmba.imag))
+            ktmp_imag = super().get_k(mol, dm_imag, hermi=0, omega=omega)
+            ktmp_imag = ktmp_imag.reshape(4,n_dm,nao,nao)
+            vk = cp.zeros((n_dm,nso,nso), dm.dtype)
+            vk[:,:nao,:nao] = ktmp_real[0] + 1j*ktmp_imag[0]
+            vk[:,nao:,nao:] = ktmp_real[1] + 1j*ktmp_imag[1]
+            vk[:,:nao,nao:] = ktmp_real[2] + 1j*ktmp_imag[2]
+            vk[:,nao:,:nao] = ktmp_real[3] + 1j*ktmp_imag[3]
+        else:
+            dm = cp.vstack((dmaa, dmbb, dmab, dmba))
+            ktmp = super().get_k(mol, dm, hermi=0, omega=omega)
+            ktmp = ktmp.reshape(4,n_dm,nao,nao)
+            vk = cp.zeros((n_dm,nso,nso), dm.dtype)
+            vk[:,:nao,:nao] = ktmp[0]
+            vk[:,nao:,nao:] = ktmp[1]
+            vk[:,:nao,nao:] = ktmp[2]
+            vk[:,nao:,:nao] = ktmp[3]
         return vk.reshape(dm_shape)
 
     def get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1):
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index f384f78dd..fbce65008 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -145,6 +145,8 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if dm is None: dm = mf.make_rdm1()
     s1e = cupy.asarray(s1e)
     dm = cupy.asarray(dm)
+    if dm.dtype == cupy.complex128:
+        s1e = s1e.astype(cupy.complex128)
     if diis_start_cycle is None:
         diis_start_cycle = mf.diis_start_cycle
     if level_shift_factor is None:
@@ -661,6 +663,8 @@ def eig(self, fock, s):
         x = None
         if hasattr(self, 'overlap_canonical_decomposed_x') and self.overlap_canonical_decomposed_x is not None:
             x = cupy.asarray(self.overlap_canonical_decomposed_x)
+        if fock.dtype == cupy.complex128:
+            s = s.astype(cupy.complex128)
         if x is None:
             mo_energy, mo_coeff = eigh(fock, s)
             return mo_energy, mo_coeff
diff --git a/gpu4pyscf/scf/tests/test_ghf.py b/gpu4pyscf/scf/tests/test_ghf.py
index 3bea9800a..d82b77e2e 100644
--- a/gpu4pyscf/scf/tests/test_ghf.py
+++ b/gpu4pyscf/scf/tests/test_ghf.py
@@ -14,21 +14,108 @@
 
 import unittest
 import pyscf
+import numpy
+import cupy as cp
+
+
+def setUpModule():
+    global mol, mol1
+    mol = pyscf.gto.Mole()
+    mol.atom = '''
+        O    0    0    0
+        H    0.   -0.757   0.587
+        H    0.   0.757    0.587'''
+    mol.spin = None
+    mol.basis = 'cc-pvdz'
+    mol.verbose = 7
+    mol.output = '/dev/null'
+    mol.build()
+
+    mol1 = pyscf.gto.M(
+        verbose = 0,
+        atom = '''
+            O    0    0    0
+            H    0.   -0.757   0.587
+            H    0.   0.757    0.587''',
+        charge = 1,
+        spin = 1,
+        basis = 'cc-pvdz',
+        output = '/dev/null')
+
+
+def tearDownModule():
+    global mol, mol1
+    mol.stdout.close()
+    mol1.stdout.close()
+    del mol, mol1
 
 class KnownValues(unittest.TestCase):
     def test_ghf_scf(self):
-        mol = pyscf.M(atom='''
-O     0    0        0
-H     0    -0.757   0.587
-H     0    0.757    0.587''', basis = 'cc-pvdz')
         mf = mol.GHF().to_gpu()
         assert mf.device == 'gpu'
         e_tot = mf.kernel()
         e_ref = mf.to_cpu().kernel()
         assert abs(e_tot - e_ref) < 1e-5
 
-    #def test_ghf_x2c(self):
-    #    pass
+    def test_ghf_scf_complex_dm(self):
+        mf = mol.GHF().to_gpu()
+        assert mf.device == 'gpu'
+        e_tot = mf.kernel()
+        dm1 = mf.make_rdm1()
+        numpy.random.seed(1)
+        n2c = mol.nao_nr() * 2
+        dm_perturb = numpy.random.random((n2c,n2c)) + 1j*numpy.random.random((n2c,n2c))
+        dm_perturb = dm_perturb + dm_perturb.T.conj()
+        dm_perturb = cp.asarray(dm_perturb)
+        dm_1 = dm1 + dm_perturb*0.001
+        e_ref = mf.to_cpu().kernel(dm_1.get())
+        e_tot = mf.kernel(dm_1)
+        assert abs(e_tot - e_ref) < 1e-5
+
+    def test_get_jk_complex(self):
+        mf = mol.GHF().to_gpu()
+        nao = mol.nao_nr()*2
+        numpy.random.seed(1)
+        d1 = numpy.random.random((nao,nao)) + 1j*numpy.random.random((nao,nao))
+        d = d1 + d1.T.conj()
+        d_real = d.real
+        vj_gpu = mf.get_j(mol, d_real)
+        vk_gpu = mf.get_k(mol, d)
+
+        mf_cpu = mf.to_cpu()
+        vj_cpu = mf_cpu.get_j(mol, d_real)
+        vk_cpu = mf_cpu.get_k(mol, d)
+
+        assert numpy.allclose(vj_gpu, vj_cpu)
+        assert numpy.allclose(vk_gpu, vk_cpu)
+
+    def test_get_jk_real(self):
+        mf = mol.GHF().to_gpu()
+
+        nao = mol.nao_nr()*2
+        numpy.random.seed(1)
+        d1 = numpy.random.random((nao,nao))
+        d = d1 + d1.T.conj()
+        d_real = d.real
+        vj_gpu = mf.get_j(mol, d_real)
+        vk_gpu = mf.get_k(mol, d)
+
+        mf_cpu = mf.to_cpu()
+        vj_cpu = mf_cpu.get_j(mol, d_real)
+        vk_cpu = mf_cpu.get_k(mol, d)
+
+        assert numpy.allclose(vj_gpu, vj_cpu)
+        assert numpy.allclose(vk_gpu, vk_cpu)
+
+    def test_to_cpu(self):
+        mf = mol.GHF().to_gpu()
+        assert mf.device == 'gpu'
+        e_tot = mf.kernel()
+        mf = mf.to_cpu()
+        assert getattr(mf, 'device', None) is None
+        e_ref = mf.kernel()
+        assert abs(e_tot - e_ref) < 1e-5
+
 
 if __name__ == "__main__":
     print("Full Tests for ghf")