diff --git a/.gitignore b/.gitignore index 23fd24bf..c7f33141 100644 --- a/.gitignore +++ b/.gitignore @@ -1,138 +1,9 @@ -# Byte-compiled / optimized / DLL files +_old +*.pyc __pycache__/ -*.py[cod] -*$py.class +deim_outputs/under/ +deim_outputs/sides/ +*.ipynb +*.png +weight/ -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# PyCharm -.idea - -*.pt -*.pth -*.onnx -*.zip -*.html -.DS_Store \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..72f5ad49 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,65 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug train.py (nproc=1 Simulation)", + "type": "python", // Use the Python debugger extension + "request": "launch", + "program": "${workspaceFolder}/train.py", // Path to your script + "console": "integratedTerminal", // Or "internalConsole" + "args": [ // Arguments normally passed after train.py + "-c", + "./deim_hgnetv2_n_coco.yml", // Replace ${model} or use specific file + "--use-amp", + "--seed=0", + "-t", + "./deim_dfine_hgnetv2_n_coco_160e.pth" + // Add any other args you need + ], + "env": { // Environment variables torchrun would set for nproc=1 + "MASTER_ADDR": "localhost", + "MASTER_PORT": "7778", // Match your torchrun port or choose another free one + "RANK": "0", + "WORLD_SIZE": "1", + "LOCAL_RANK": "0" + // Add PYTHONPATH if your imports require it and VS Code doesn't pick it up automatically + // "PYTHONPATH": "${workspaceFolder}/../:${env:PYTHONPATH}" // Example if train.py needs parent dir + }, + "justMyCode": true // Set to false to step into library code + }, + { + "name": "Python: Debug Script", + "type": "python", + "request": "launch", + "program": 
"${workspaceFolder}/tools/inference/torch_inf_super.py", + "args": [ + "-c", + "deim_hgnetv2_n_coco.yml", + "-r", + "deim_outputs/deim_hgnetv2_n_coco/best_stg1_converted.pth", + "--input", + "66d8c56ea502fd4f902c330e_TC01_cupy.mp4", + "--device", + "cpu" + ], + "console": "integratedTerminal", + "justMyCode": false + }, + { + "name": "Python: train.py", + "type": "python", + "request": "launch", + "program": "train.py", + "console": "integratedTerminal", + "args": [ + "-c", + "deim_model_under.yml", + "--use-amp", + "--seed=0", + "-t", + "base_model.pth" + ], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..2496217c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,26 @@ +**Prompt:** +Design a script that facilitates the creation of a new AI training module. The script should perform the following tasks: + +1. **Directory Structure**: + - Create a clean directory structure for the new project, ensuring it includes all necessary folders for modules, data, and scripts. + - Copy required files and folders from the `_old` directory to the new project directory without altering the logic for training. do not edit any files in _old and do not import from old ever! + +2. **Module Creation**: + - In the root of the new project directory, create a new Python script that serves as the entry point for both training and inference. + - Ensure the new module can be imported easily and allows for training and inference to be executed in fewer than 10 lines of code. + +3. **Dependencies**: + - Utilize the `supervision` package for annotation purposes, ensuring users can easily add annotated files for training. + - Include instructions within the script for users to run the training and inference processes. + +4. **Environment Configuration**: + - Specify the use of the Python interpreter located at `/home/hidara/miniconda3/envs/deim/bin/python` throughout the project. + - Implement linting with `ruff` and type checking with `pyright` to ensure code quality. + +5. **User Guidance**: + - Add comments and documentation within the script to guide new users who may be unfamiliar with AI training processes, focusing on ease of use for adding new annotated files for training and transfer learning. + +6. **Execution**: + - Ensure that the script does not run any training code automatically due to the long-running nature of the training sessions on the GPU. + +By following these specifications, the resulting script will create a user-friendly environment for new users to engage with AI training and inference, making the process straightforward and accessible. \ No newline at end of file diff --git a/DEIM.code-workspace b/DEIM.code-workspace new file mode 100644 index 00000000..6a7cf15e --- /dev/null +++ b/DEIM.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "." + }, + { + "path": "../../datasets" + } + ], + "settings": {} +} \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/README.md b/README.md index 5d4d5ba5..e49ea2ac 100644 --- a/README.md +++ b/README.md @@ -1,448 +1,529 @@ -

- DEIM: DETR with Improved Matching for Fast Convergence -

- -

- - license - - - arXiv - - - project webpage - - - prs - - - issues - - - stars - - - Contact Us - -

- -

- DEIM is an advanced training framework designed to enhance the matching mechanism in DETRs, enabling faster convergence and improved accuracy. It serves as a robust foundation for future research and applications in the field of real-time object detection. -

+# DEIM - DETR with Improved Matching + +> **High-Performance Object Detection for Thermal Imaging in Mining Environments** + +A production-ready Python module for thermal object detection, optimized for vehicle undercarriage and side monitoring in North Australian mining operations. Built on DEIM (DETR with Improved Matching) architecture with HGNetv2 backbone. + +[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)](https://pytorch.org/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) --- +## ๐ŸŽฏ Features -
- Shihua Huang1, - Zhichao Lu2, - Xiaodong Cun3, - Yongjun Yu1, - Xiao Zhou4, - Xi Shen1* -
- - -

- -1. Intellindust AI Lab   2. City University of Hong Kong   3. Great Bay University   4. Hefei Normal University - -

- -

- **๐Ÿ“ง Corresponding author:** shenxiluc@gmail.com -

- -

- - sota - -

- -

-If you like our work, please give us a โญ! -

- - -

- Image 1 - Image 2 -

- - - - - -## ๐Ÿš€ Updates -- [x] **\[2025.03.12\]** The Object365 Pretrained [DEIM-D-FINE-X](https://drive.google.com/file/d/1RMNrHh3bYN0FfT5ZlWhXtQxkG23xb2xj/view?usp=drive_link) model is released, which achieves 59.5% AP after fine-tuning 24 COCO epochs. -- [x] **\[2025.03.05\]** The Nano DEIM model is released. -- [x] **\[2025.02.27\]** The DEIM paper is accepted to CVPR 2025. Thanks to all co-authors. -- [x] **\[2024.12.26\]** A more efficient implementation of Dense O2O, achieving nearly a 30% improvement in loading speed (See [the pull request](https://github.com/ShihuaHuang95/DEIM/pull/13) for more details). Huge thanks to my colleague [Longfei Liu](https://github.com/capsule2077). -- [x] **\[2024.12.03\]** Release DEIM series. Besides, this repo also supports the re-implmentations of [D-FINE](https://arxiv.org/abs/2410.13842) and [RT-DETR](https://arxiv.org/abs/2407.17140). - -## Table of Content -* [1. Model Zoo](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#1-model-zoo) -* [2. Quick start](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#2-quick-start) -* [3. Usage](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#3-usage) -* [4. Tools](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#4-tools) -* [5. Citation](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#5-citation) -* [6. Acknowledgement](https://github.com/ShihuaHuang95/DEIM?tab=readme-ov-file#6-acknowledgement) - - -## 1. Model Zoo - -### DEIM-D-FINE -| Model | Dataset | APD-FINE | APDEIM | #Params | Latency | GFLOPs | config | checkpoint -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: -**N** | COCO | **42.8** | **43.0** | 4M | 2.12ms | 7 | [yml](./configs/deim_dfine/deim_hgnetv2_n_coco.yml) | [ckpt](https://drive.google.com/file/d/1ZPEhiU9nhW4M5jLnYOFwTSLQC1Ugf62e/view?usp=sharing) | -**S** | COCO | **48.7** | **49.0** | 10M | 3.49ms | 25 | [yml](./configs/deim_dfine/deim_hgnetv2_s_coco.yml) | [ckpt](https://drive.google.com/file/d/1tB8gVJNrfb6dhFvoHJECKOF5VpkthhfC/view?usp=drive_link) | -**M** | COCO | **52.3** | **52.7** | 19M | 5.62ms | 57 | [yml](./configs/deim_dfine/deim_hgnetv2_m_coco.yml) | [ckpt](https://drive.google.com/file/d/18Lj2a6UN6k_n_UzqnJyiaiLGpDzQQit8/view?usp=drive_link) | -**L** | COCO | **54.0** | **54.7** | 31M | 8.07ms | 91 | [yml](./configs/deim_dfine/deim_hgnetv2_l_coco.yml) | [ckpt](https://drive.google.com/file/d/1PIRf02XkrA2xAD3wEiKE2FaamZgSGTAr/view?usp=drive_link) | -**X** | COCO | **55.8** | **56.5** | 62M | 12.89ms | 202 | [yml](./configs/deim_dfine/deim_hgnetv2_x_coco.yml) | [ckpt](https://drive.google.com/file/d/1dPtbgtGgq1Oa7k_LgH1GXPelg1IVeu0j/view?usp=drive_link) | - - -### DEIM-RT-DETRv2 -| Model | Dataset | APRT-DETRv2 | APDEIM | #Params | Latency | GFLOPs | config | checkpoint -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: -**S** | COCO | **47.9** | **49.0** | 20M | 4.59ms | 60 | [yml](./configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml) | [ckpt](https://drive.google.com/file/d/153_JKff6EpFgiLKaqkJsoDcLal_0ux_F/view?usp=drive_link) | -**M** | COCO | **49.9** | **50.9** | 31M | 6.40ms | 92 | [yml](./configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml) | [ckpt](https://drive.google.com/file/d/1O9RjZF6kdFWGv1Etn1Toml4r-YfdMDMM/view?usp=drive_link) | -**M*** | COCO | **51.9** | **53.2** | 33M | 6.90ms | 100 | [yml](./configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml) | [ckpt](https://drive.google.com/file/d/10dLuqdBZ6H5ip9BbBiE6S7ZcmHkRbD0E/view?usp=drive_link) | -**L** | COCO | **53.4** | **54.3** | 
42M | 9.15ms | 136 | [yml](./configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml) | [ckpt](https://drive.google.com/file/d/1mWknAXD5JYknUQ94WCEvPfXz13jcNOTI/view?usp=drive_link) | -**X** | COCO | **54.3** | **55.5** | 76M | 13.66ms | 259 | [yml](./configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml) | [ckpt](https://drive.google.com/file/d/1BIevZijOcBO17llTyDX32F_pYppBfnzu/view?usp=drive_link) | - - -## 2. Quick start - -### Setup - -```shell -conda create -n deim python=3.11.9 -conda activate deim -pip install -r requirements.txt -``` +- ๐Ÿ”ฅ **Thermal-Optimized**: Handles colormap variations across multiple environmental temperatures +- โšก **Simple API**: Train and infer in <10 lines of code (like Ultralytics YOLO) +- ๐ŸŽจ **Advanced Augmentation**: Mining-specific transforms (heat shimmer, dust, motion blur) +- ๐Ÿ“Š **Two-Stage Training**: Stage 1 (89%) with aggressive augmentation, Stage 2 (11%) clean fine-tuning +- ๐Ÿ”„ **Auto-Scaling**: Epoch-dependent parameters automatically adjust with custom training lengths +- ๐Ÿ“ **Format Support**: COCO (primary) and YOLO formats with bidirectional conversion +- ๐ŸŽฅ **Multi-Input**: Images, videos, directories, batches +- ๐Ÿ‘๏ธ **Visualization**: Built-in supervision package integration +- ๐Ÿ—๏ธ **Multi-Architecture**: Test YOLO, RT-DETR, D-FINE, and custom models on same dataset +## ๐Ÿ“š Documentation -### Data Preparation +| Document | Description | +|----------|-------------| +| ๐Ÿ“– [**QUICKSTART.md**](docs/QUICKSTART.md) | Get started in 3 steps - train on custom datasets in 5 minutes | +| โš™๏ธ [**CONFIGURATION_REFERENCE.md**](docs/CONFIGURATION_REFERENCE.md) | Complete parameter reference for config files | +| ๐Ÿ”„ [**FORMAT_CONVERSION.md**](docs/FORMAT_CONVERSION.md) | Convert between COCO and YOLO annotation formats | -
- COCO2017 Dataset +--- -1. Download COCO2017 from [OpenDataLab](https://opendatalab.com/OpenDataLab/COCO_2017) or [COCO](https://cocodataset.org/#download). -1. Modify paths in [coco_detection.yml](./configs/dataset/coco_detection.yml) +## ๐Ÿ—๏ธ Architecture Overview - ```yaml - train_dataloader: - img_folder: /data/COCO2017/train2017/ - ann_file: /data/COCO2017/annotations/instances_train2017.json - val_dataloader: - img_folder: /data/COCO2017/val2017/ - ann_file: /data/COCO2017/annotations/instances_val2017.json - ``` +```mermaid +graph LR + A[Input Image
640x640] --> B[HGNetv2 Backbone
Feature Extraction] + B --> C[Hybrid Encoder
Multi-Scale Features] + C --> D[DFINE Transformer
Decoder] + D --> E[Detection Head
Boxes + Labels] -
+ style A fill:#e1f5ff + style B fill:#fff4e1 + style C fill:#ffe1f5 + style D fill:#e1ffe1 + style E fill:#f5e1ff +``` -
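To make the diagram concrete: with the 640×640 input and the stride-16/32 feature maps used by the B0 configs described below, the encoder works on about 2,000 spatial tokens and the decoder condenses them into 300 query slots. A small sketch of that arithmetic (illustrative only):

```python
# Feature-map sizes implied by the diagram for a 640x640 input.
# hidden_dim=128 and num_queries=300 are taken from the configs shown later in this README.
input_hw = (640, 640)
for stride in (16, 32):
    h, w = input_hw[0] // stride, input_hw[1] // stride
    print(f"stride {stride}: {h}x{w} feature map -> {h * w} encoder tokens")
# stride 16: 40x40 feature map -> 1600 encoder tokens
# stride 32: 20x20 feature map -> 400 encoder tokens
# The DFINE decoder then reduces these ~2000 tokens to 300 detection queries (boxes + labels).
```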
-Custom Dataset +### Component Deep-Dive -To train on your custom dataset, you need to organize it in the COCO format. Follow the steps below to prepare your dataset: +#### 1. **HGNetv2 Backbone** (PP-HGNetV2) -1. **Set `remap_mscoco_category` to `False`:** +**Purpose**: Efficient feature extraction optimized for GPU inference - This prevents the automatic remapping of category IDs to match the MSCOCO categories. +**Key Innovations**: +- **Learnable Affine Blocks (LAB)**: Adaptive feature scaling and biasing +- **ESE Module**: Efficient Squeeze-and-Excitation for channel attention +- **Light ConvBNAct**: Depthwise separable convolutions for efficiency +- **Multi-Stage Design**: Progressive feature abstraction - ```yaml - remap_mscoco_category: False - ``` +**Architecture Flow**: +```mermaid +graph TD + A[Input 3x640x640] --> B[Stem Block
3→16→16 channels] + B --> C[Stage 1
16→64 channels
No downsample] + C --> D[Stage 2
64→256 channels
Stride 2] + D --> E[Stage 3
256→512 channels
Stride 2
Light blocks] + E --> F[Stage 4
512→1024 channels
Stride 2
Light blocks] -2. **Organize Images:** + E -.-> G[Output: 512 channels
Feature stride 16] + F -.-> H[Output: 1024 channels
Feature stride 32] - Structure your dataset directories as follows: + style G fill:#90EE90 + style H fill:#90EE90 +``` - ```shell - dataset/ - โ”œโ”€โ”€ images/ - โ”‚ โ”œโ”€โ”€ train/ - โ”‚ โ”‚ โ”œโ”€โ”€ image1.jpg - โ”‚ โ”‚ โ”œโ”€โ”€ image2.jpg - โ”‚ โ”‚ โ””โ”€โ”€ ... - โ”‚ โ”œโ”€โ”€ val/ - โ”‚ โ”‚ โ”œโ”€โ”€ image1.jpg - โ”‚ โ”‚ โ”œโ”€โ”€ image2.jpg - โ”‚ โ”‚ โ””โ”€โ”€ ... - โ””โ”€โ”€ annotations/ - โ”œโ”€โ”€ instances_train.json - โ”œโ”€โ”€ instances_val.json - โ””โ”€โ”€ ... - ``` - - - **`images/train/`**: Contains all training images. - - **`images/val/`**: Contains all validation images. - - **`annotations/`**: Contains COCO-formatted annotation files. - -3. **Convert Annotations to COCO Format:** - - If your annotations are not already in COCO format, you'll need to convert them. You can use the following Python script as a reference or utilize existing tools: - - ```python - import json - - def convert_to_coco(input_annotations, output_annotations): - # Implement conversion logic here - pass - - if __name__ == "__main__": - convert_to_coco('path/to/your_annotations.json', 'dataset/annotations/instances_train.json') - ``` - -4. **Update Configuration Files:** - - Modify your [custom_detection.yml](./configs/dataset/custom_detection.yml). - - ```yaml - task: detection - - evaluator: - type: CocoEvaluator - iou_types: ['bbox', ] - - num_classes: 777 # your dataset classes - remap_mscoco_category: False - - train_dataloader: - type: DataLoader - dataset: - type: CocoDetection - img_folder: /data/yourdataset/train - ann_file: /data/yourdataset/train/train.json - return_masks: False - transforms: - type: Compose - ops: ~ - shuffle: True - num_workers: 4 - drop_last: True - collate_fn: - type: BatchImageCollateFunction - - val_dataloader: - type: DataLoader - dataset: - type: CocoDetection - img_folder: /data/yourdataset/val - ann_file: /data/yourdataset/val/ann.json - return_masks: False - transforms: - type: Compose - ops: ~ - shuffle: False - num_workers: 4 - drop_last: False - collate_fn: - type: BatchImageCollateFunction - ``` - -
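The Learnable Affine Block called out under Key Innovations is a deliberately tiny component: a learnable scale and bias applied to activations. A minimal PyTorch sketch, following the PP-HGNetV2 design rather than this repository's exact code:

```python
import torch
from torch import nn

class LearnableAffineBlock(nn.Module):
    """Learnable Affine Block (LAB): rescales and shifts activations with two learnable scalars."""
    def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(scale_value))
        self.bias = nn.Parameter(torch.tensor(bias_value))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.scale * x + self.bias
```

Because both parameters are learned, the backbone can cheaply re-normalize feature statistics, which is why LAB is credited above with helping the model absorb colormap shifts across ambient temperatures.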
- - -## 3. Usage -
- COCO2017 - -1. Training -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml --use-amp --seed=0 +**Why HGNetv2 for Thermal Mining?** + +1. **Efficiency**: 7.1 GFLOPs / 3.5 GMACs - runs real-time on edge GPUs +2. **Multi-Scale**: Extracts features at stride 16 & 32 for various object sizes +3. **LAB Blocks**: Adapt to colormap variations (-10ยฐC to 50ยฐC environments) +4. **Light Blocks**: Stages 3-4 use efficient depthwise convolutions +5. **Pretrained**: ImageNet-pretrained for faster convergence + +**B0 Configuration** (used in `under` and `sides`): +- **Parameters**: 3.7M +- **Stem**: 3 โ†’ 16 โ†’ 16 channels +- **Output Channels**: [512, 1024] at strides [16, 32] +- **Kernel Sizes**: Stage 1-2 use 3ร—3, Stage 3-4 use 5ร—5 +- **Layer Depth**: 3 layers per HG_Block + +#### 2. **Hybrid Encoder** + +**Purpose**: Bridge backbone features to transformer decoder with cross-scale fusion + +**Configuration** (`under` & `sides`): +```yaml +in_channels: [512, 1024] # From HGNetv2 stages 3-4 +feat_strides: [16, 32] # Spatial resolutions +hidden_dim: 128 # Compressed feature dimension +dim_feedforward: 512 # FFN hidden size +expansion: 0.34 # Cross-scale expansion ratio +depth_mult: 0.5 # Network depth multiplier ``` - -2. Testing -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml --test-only -r model.pth +**Flow**: +```mermaid +graph LR + A[Backbone Features
512@16 + 1024@32] --> B[Intra-Scale
Self-Attention] + B --> C[Cross-Scale
Feature Fusion] + C --> D[Output 128@16
128@32] + + style D fill:#90EE90 ``` - -3. Tuning -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml --use-amp --seed=0 -t model.pth +#### 3. **DFINE Transformer Decoder** + +**Purpose**: Detection-focused transformer with improved matching + +**Key Features**: +- **Deformable Attention**: Adaptive sampling points (6 per level) +- **Improved Matching**: Better query-target assignment +- **Query Denoising**: Stabilizes training +- **Multi-Level**: Processes features at 2 scales + +**Configuration**: +```yaml +num_layers: 3 # Decoder depth +hidden_dim: 128 # Feature dimension +num_queries: 300 # Detection queries +num_points: [6, 6] # Deformable attention points +num_denoising: 100 # Denoising queries ``` -
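The decoder always emits `num_queries` (300) box/label pairs per image; the `PostProcessor` keeps the top-scoring ones, and the confidence threshold passed at inference time filters the rest. A conceptual sketch of that selection step, not the repository's exact implementation:

```python
import torch

def select_top_detections(pred_logits: torch.Tensor, pred_boxes: torch.Tensor,
                          num_top: int = 300, conf_threshold: float = 0.5):
    """pred_logits: [B, num_queries, num_classes]; pred_boxes: [B, num_queries, 4] (normalized cxcywh)."""
    scores = pred_logits.sigmoid()                 # focal-style scoring, no softmax background class
    flat = scores.flatten(1)                       # [B, num_queries * num_classes]
    top_scores, top_idx = flat.topk(min(num_top, flat.shape[1]), dim=1)
    query_idx = top_idx // scores.shape[2]         # which query produced each score
    class_idx = top_idx % scores.shape[2]          # which class it belongs to
    boxes = pred_boxes.gather(1, query_idx.unsqueeze(-1).expand(-1, -1, 4))
    keep = top_scores > conf_threshold             # e.g. the conf_threshold=0.5 used in the inference examples below
    return boxes, class_idx, top_scores, keep
```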
-
- Customizing Batch Size +--- + +## ๐Ÿš€ Quick Start -For example, if you want to double the total batch size when training D-FINE-L on COCO2017, here are the steps you should follow: +### Installation -1. **Modify your [dataloader.yml](./configs/base/dataloader.yml)** to increase the `total_batch_size`: +```bash +# Clone repository +cd /path/to/DEIM - ```yaml - train_dataloader: - total_batch_size: 64 # Previously it was 32, now doubled - ``` +# Install dependencies +pip install -r requirements.txt -2. **Modify your [deim_hgnetv2_l_coco.yml](./configs/deim_dfine/deim_hgnetv2_l_coco.yml)**. Hereโ€™s how the key parameters should be adjusted: +# Install module +pip install -e . +``` - ```yaml - optimizer: - type: AdamW - params: - - - params: '^(?=.*backbone)(?!.*norm|bn).*$' - lr: 0.000025 # doubled, linear scaling law - - - params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' - weight_decay: 0. +### Monitoring Training with TensorBoard - lr: 0.0005 # doubled, linear scaling law - betas: [0.9, 0.999] - weight_decay: 0.0001 # need a grid search +```bash +# Launch TensorBoard (from project root) +tensorboard --logdir=deim_outputs - ema: # added EMA settings - decay: 0.9998 # adjusted by 1 - (1 - decay) * 2 - warmups: 500 # halved +# Open browser to: http://localhost:6006 +# View real-time training metrics, losses, and validation results +``` - lr_warmup_scheduler: - warmup_duration: 250 # halved - ``` +**TensorBoard Features**: +- ๐Ÿ“Š Real-time loss curves (training & validation) +- ๐Ÿ“ˆ Learning rate schedules +- ๐ŸŽฏ mAP/precision/recall metrics +- ๐Ÿ–ผ๏ธ Sample predictions with ground truth -
+### Training +```python +from deim import DEIM -
- Customizing Input Size +# Initialize with config +model = DEIM(config='under') # or 'sides' -If you'd like to train **DEIM** on COCO2017 with an input size of 320x320, follow these steps: +# Train from scratch +model.train( + epochs=400, # Auto-scales to 356 (Stage 1) + 44 (Stage 2) + batch_size=32, + learning_rate=0.001 +) -1. **Modify your [dataloader.yml](./configs/base/dataloader.yml)**: +# Transfer learning +model.train( + pretrained='base_model.pth', + epochs=100 +) +``` - ```yaml +### Inference + +```python +from deim import DEIM + +# Load trained model +model = DEIM(config='under') +model.load('deim_outputs/under/20241002_143022/best_stg1.pth') + +# Single image +results = model.predict('truck_thermal.jpg', visualize=True) + +# Batch processing +results = model.predict([ + 'image1.jpg', + 'image2.jpg', + 'image3.jpg' +]) + +# Video processing +results = model.predict( + 'thermal_video.mp4', + save_path='output_annotated.mp4', + conf_threshold=0.5 +) + +# Directory +results = model.predict( + 'thermal_images/', + save_dir='outputs/', + visualize=True +) +``` - train_dataloader: - dataset: - transforms: - ops: - - {type: Resize, size: [320, 320], } - collate_fn: - base_size: 320 - dataset: - transforms: - ops: - - {type: Resize, size: [320, 320], } - ``` +--- -2. **Modify your [dfine_hgnetv2.yml](./configs/base/dfine_hgnetv2.yml)**: +## ๐Ÿ“Š Two-Stage Training Strategy - ```yaml - eval_spatial_size: [320, 320] - ``` +```mermaid +gantt + title Training Pipeline (400 Epochs Example) + dateFormat X + axisFormat %s -
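The 89/11 split shown in the chart is the same rule the configs apply at every training length (356/44 for 400 epochs, 285/35 for 320, 71/9 for 80). A minimal sketch of the scaling arithmetic, assuming plain rounding:

```python
def split_epochs(total_epochs: int, aug_fraction: float = 0.89):
    """Return (stage1_epochs_with_augmentation, stage2_clean_epochs)."""
    stage1 = round(total_epochs * aug_fraction)   # also used as the dataloader stop_epoch / policy epoch
    return stage1, total_epochs - stage1

print(split_epochs(400))  # (356, 44)
print(split_epochs(320))  # (285, 35)
print(split_epochs(80))   # (71, 9)
```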
+ section Stage 1: Augmentation + Thermal transforms active :active, 0, 356 -## 4. Tools -
- Deployment + section Stage 2: Fine-tuning + Clean data only :crit, 356, 44 - -1. Setup -```shell -pip install onnx onnxsim + section Milestones + Stop augmentation :milestone, 356, 0 + Training complete :milestone, 400, 0 ``` -2. Export onnx -```shell -python tools/deployment/export_onnx.py --check -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml -r model.pth -``` +### Stage 1 (Epochs 1-356, 89%) + +**Purpose**: Robust feature learning under mining conditions -3. Export [tensorrt](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) -```shell -trtexec --onnx="model.onnx" --saveEngine="model.engine" --fp16 +**Active Augmentations**: +```yaml +- RandomPhotometricDistort (p=0.5) # Colormap variations +- GaussianBlur (p=0.3) # Heat shimmer, dust, motion +- RandomRotation (ยฑ10ยฐ, p=0.5) # Vehicle angles +- RandomPerspective (p=0.3) # Camera position +- RandomAdjustSharpness (p=0.3) # Thermal focus variation +- RandomZoomOut # Distance variation +- RandomIoUCrop (p=0.8) # Partial frame entry/exit +- RandomHorizontalFlip # Symmetry augmentation ``` -
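The Stage 1 transform names above mirror torchvision's `transforms.v2` API (the project builds them from the YAML files under `deim/_configs/_base/`). A rough standalone approximation for experimenting outside the training pipeline; the `RandomApply` wrappers are assumptions where the torchvision class has no `p` argument of its own:

```python
import torch
from torchvision.transforms import v2

stage1_augmentation = v2.Compose([
    v2.RandomPhotometricDistort(p=0.5),                    # colormap / thermal signature variation
    v2.RandomZoomOut(fill=0, p=0.5),                       # varying truck distance from the camera
    v2.RandomApply([v2.RandomIoUCrop()], p=0.8),           # partial frame entry/exit
    v2.SanitizeBoundingBoxes(min_size=1),
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomApply([v2.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0))], p=0.3),  # heat shimmer, dust, motion
    v2.RandomAdjustSharpness(sharpness_factor=2, p=0.3),   # thermal focus variation
    v2.Resize((640, 640)),
    v2.SanitizeBoundingBoxes(min_size=1),
    v2.ToDtype(torch.float32, scale=True),
])
```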
+### Stage 2 (Epochs 357-400, 11%) -
- Inference (Visualization) +**Purpose**: Clean fine-tuning on exact thermal representations +**Active**: Resize + Normalization only -1. Setup -```shell -pip install -r tools/inference/requirements.txt -``` +**Why This Works**: +- Stage 1 teaches robustness to environmental variations +- Stage 2 corrects overfitting and refines predictions +- 89/11 split maximizes augmentation exposure while ensuring clean convergence + +--- + +## ๐ŸŽจ Thermal-Specific Augmentations + +### Environmental Challenges + +| Challenge | Temperature Range | Augmentation Solution | +|-----------|------------------|----------------------| +| **Ambient Variation** | -10ยฐC to 50ยฐC | RandomPhotometricDistort | +| **Heat Shimmer** | Hot undercarriage | GaussianBlur | +| **Dust/Particulates** | Mining roads | GaussianBlur | +| **Vehicle Motion** | Fast-moving trucks | GaussianBlur + RandomRotation | +| **Camera Mounting** | Slight misalignment | RandomPerspective | +| **Thermal Focus** | Variable camera focus | RandomAdjustSharpness | +| **Frame Entry/Exit** | Partial visibility | RandomIoUCrop | +### Colourmap Handling - -2. Inference (onnxruntime / tensorrt / torch) +**Problem**: Environmental temperature (-10ยฐC to 50ยฐC) shifts the entire colormap, drastically changing object appearance. -Inference on images and videos is now supported. -```shell -python tools/inference/onnx_inf.py --onnx model.onnx --input image.jpg # video.mp4 -python tools/inference/trt_inf.py --trt model.engine --input image.jpg -python tools/inference/torch_inf.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml -r model.pth --input image.jpg --device cuda:0 +**Solution**: RandomPhotometricDistort simulates these shifts during training, making the model colormap-invariant while preserving thermal signature recognition. + +--- + +## ๐Ÿ“ Configuration + +### Pre-configured Models + +#### `under` - Undercarriage Detection +```yaml +Camera Position: Road-mounted, looking UP +View: Vehicle undercarriage +Objects: Engine, exhaust, drivetrain, hot spots +Training: 320 epochs (285 aug + 35 clean) +Batch Size: 8 ``` -
-
- Benchmark +#### `sides` - Side Profile Detection +```yaml +Camera Position: 90ยฐ to truck wheels +View: Side profile +Objects: Wheels, brakes, suspension, overheating components +Training: 80 epochs (71 aug + 9 clean) +Batch Size: 32 +``` -1. Setup -```shell -pip install -r tools/benchmark/requirements.txt +### Training on Custom Datasets + +**Want to train DEIM on your own dataset?** + +๐Ÿ“š **See Documentation**: [QUICKSTART.md](docs/QUICKSTART.md) for 3-step setup + +**Quick example**: +```python +from deim import DEIM + +# 1. Create config files (see QUICKSTART.md) +# 2. Train on your dataset +model = DEIM(config='my_dataset') +model.train(epochs=200) + +# 3. Run inference +model.load('deim_outputs/my_dataset/best_stg2.pth') +results = model.predict('my_image.jpg', visualize=True) ``` - -2. Model FLOPs, MACs, and Params -```shell -python tools/benchmark/get_info.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml +**Runtime overrides**: +```python +model = DEIM(config='under') +model.train( + epochs=500, # Auto-scales to 445 (89%) + 55 (11%) + batch_size=16, + learning_rate=0.0005, + dataset_path='/custom/dataset' +) ``` -2. TensorRT Latency -```shell -python tools/benchmark/trt_benchmark.py --COCO_dir path/to/COCO2017 --engine_dir model.engine +--- + +## ๐Ÿ“‚ Dataset Structure + +### COCO Format (Recommended) + +We use **COCO format** as the primary annotation format because: +- โœ… Multi-model testing (YOLO, RT-DETR, D-FINE, etc.) +- โœ… Industry standard for benchmarking +- โœ… Rich metadata and standardized evaluation metrics +- โœ… Framework compatibility (Detectron2, MMDetection, Ultralytics) + +``` +dataset/ +โ”œโ”€โ”€ annotations/ +โ”‚ โ”œโ”€โ”€ instances_train.json +โ”‚ โ””โ”€โ”€ instances_val.json +โ””โ”€โ”€ images/ + โ”œโ”€โ”€ train/ + โ”‚ โ”œโ”€โ”€ thermal001.jpg + โ”‚ โ”œโ”€โ”€ thermal002.jpg + โ”‚ โ””โ”€โ”€ ... + โ””โ”€โ”€ val/ + โ”œโ”€โ”€ thermal101.jpg + โ””โ”€โ”€ ... ``` -
-
- Fiftyone Visualization +### YOLO Format (Alternative) -1. Setup -```shell -pip install fiftyone ``` -4. Voxel51 Fiftyone Visualization ([fiftyone](https://github.com/voxel51/fiftyone)) -```shell -python tools/visualization/fiftyone_vis.py -c configs/deim_dfine/deim_hgnetv2_${model}_coco.yml -r model.pth +dataset/ +โ”œโ”€โ”€ train/ +โ”‚ โ”œโ”€โ”€ thermal001.jpg +โ”‚ โ”œโ”€โ”€ thermal001.txt # class x_center y_center width height +โ”‚ โ”œโ”€โ”€ thermal002.jpg +โ”‚ โ”œโ”€โ”€ thermal002.txt +โ”‚ โ””โ”€โ”€ ... +โ””โ”€โ”€ val/ + โ”œโ”€โ”€ thermal101.jpg + โ”œโ”€โ”€ thermal101.txt + โ””โ”€โ”€ ... ``` -
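For reference, `instances_train.json` / `instances_val.json` follow the standard COCO `instances` schema. A minimal hand-written example with one image and one box (`bbox` is `[x, y, width, height]` in pixels; the category ids here simply follow the 0-based `class_names` used in this project's configs):

```json
{
  "images": [
    {"id": 1, "file_name": "thermal001.jpg", "width": 640, "height": 640}
  ],
  "annotations": [
    {"id": 1, "image_id": 1, "category_id": 0, "bbox": [120, 200, 80, 64], "area": 5120, "iscrowd": 0}
  ],
  "categories": [
    {"id": 0, "name": "wheel"}
  ]
}
```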
-
- Others +### Format Conversion + +Need to convert between formats? See **[FORMAT_CONVERSION.md](docs/FORMAT_CONVERSION.md)** for: +- ๐Ÿ”„ COCO โ†’ YOLO conversion (Ultralytics built-in) +- ๐Ÿ”„ YOLO โ†’ COCO conversion (with examples) +- ๐Ÿ“Š Format comparison and use cases + +--- + +## ๐Ÿ“ˆ Output Structure -1. Auto Resume Training -```shell -bash reference/safe_training.sh ``` +deim_outputs/ +โ”œโ”€โ”€ under/ +โ”‚ โ””โ”€โ”€ 20241002_143022/ # Timestamp +โ”‚ โ”œโ”€โ”€ best_stg1.pth # Stage 1 checkpoint +โ”‚ โ”œโ”€โ”€ best_stg2.pth # Stage 2 checkpoint (if applicable) +โ”‚ โ”œโ”€โ”€ config.yml # Training config snapshot +โ”‚ โ””โ”€โ”€ logs/ # TensorBoard logs +โ””โ”€โ”€ sides/ + โ””โ”€โ”€ 20241003_091530/ + โ””โ”€โ”€ ... +``` + +--- + +## ๐Ÿ”ง Development + +```bash +# Linting +ruff check . + +# Type checking +pyright . -2. Converting Model Weights -```shell -python reference/convert_weight.py model.pth +# Run tests +pytest tests/ ``` -
+### Environment + +- **Python**: 3.8+ +- **PyTorch**: 2.0+ +- **CUDA**: GPU required +- **Interpreter**: `/home/hidara/miniconda3/envs/deim/bin/python` + +--- + +## ๐Ÿ“– Architecture References + +### DEIM (DETR with Improved Matching) +- Improved query matching mechanism over vanilla DETR +- Faster convergence through better assignment +- Multi-scale deformable attention + +### HGNetv2 (PP-HGNetV2) +- **Paper**: PaddlePaddle/PaddleDetection +- **Key Features**: Learnable Affine Blocks, ESE modules, lightweight design +- **Performance**: SOTA efficiency-accuracy trade-off for real-time detection + +### D-FINE (Detection Transformer) +- **Decoder**: Deformable attention with adaptive sampling +- **Matching**: Hungarian matcher with improved cost functions +- **Loss**: VFL (Varifocal Loss) + GIoU + Local feature loss + +--- + +## ๐ŸŽฏ Use Case: Mining Vehicle Monitoring + +### Deployment Scenario -## 5. Citation -If you use `DEIM` or its methods in your work, please cite the following BibTeX entries: -
- bibtex +```mermaid +graph TB + A[Road-Train Approaches] --> B{Camera System} + B --> C[Under Camera
Road-mounted] + B --> D[Sides Camera
90° mounted] + + C --> E[DEIM Inference
under model] + D --> F[DEIM Inference
sides model] + + E --> G{Thermal Hotspots?} + F --> G + + G -->|Yes| H[Alert: Overheating
Component ID + Temp] + G -->|No| I[Vehicle OK
Log + Continue] + + style H fill:#ff6b6b + style I fill:#51cf66 ``` -
-## 6. Acknowledgement -Our work is built upon [D-FINE](https://github.com/Peterande/D-FINE) and [RT-DETR](https://github.com/lyuwenyu/RT-DETR). +### Critical Detection Targets + +| Camera | Objects of Interest | Temperature Range | Risk Level | +|--------|-------------------|------------------|-----------| +| **Under** | Engine, Exhaust, Drivetrain, Hydraulics | 80-200ยฐC | Critical | +| **Sides** | Brakes, Wheels, Suspension, Bearings | 60-150ยฐC | High | + +--- + +## ๐Ÿ“„ License + +MIT License - See [LICENSE](LICENSE) file for details. + +--- + +## ๐Ÿ™ Acknowledgments + +- **DEIM Architecture**: D-FINE Authors +- **HGNetv2 Backbone**: PaddlePaddle Team +- **Thermal Optimization**: Mining environment adaptations +- **Supervision**: Visualization package integration + +--- + +## ๐Ÿ“ž Support & Resources + +### Documentation +- ๐Ÿ“– [Quick Start Guide](docs/QUICKSTART.md) - Setup in 5 minutes +- โš™๏ธ [Configuration Reference](docs/CONFIGURATION_REFERENCE.md) - All parameters explained +- ๐Ÿ”„ [Format Conversion](docs/FORMAT_CONVERSION.md) - COCO โ†” YOLO conversion + +### Getting Help +- ๐Ÿ’ฌ Open an issue on GitHub +- ๐Ÿ“ง Check existing documentation +- ๐Ÿ” Review example scripts + +--- + +
+ +**Built for thermal imaging in extreme mining environments** +**Optimized for real-time GPU inference โ€ข Production-ready** + +๐Ÿš› ๐Ÿ”ฅ โšก -โœจ Feel free to contribute and reach out if you have any questions! โœจ +
\ No newline at end of file diff --git a/deim/__init__.py b/deim/__init__.py new file mode 100644 index 00000000..8ddbeffd --- /dev/null +++ b/deim/__init__.py @@ -0,0 +1,24 @@ +""" +DEIM - DETR with Improved Matching + +A simple and powerful object detection module +Similar to ultralytics but for DEIM models + +Example: + >>> from deim import DEIM + >>> + >>> # Train from scratch + >>> model = DEIM(config='under') + >>> model.train(epochs=100, batch_size=32) + >>> + >>> # Inference + >>> model = DEIM(config='under') + >>> model.load('path/to/checkpoint.pth') + >>> results = model.predict('image.jpg') +""" + +__version__ = '1.0.0' + +from .api import DEIM + +__all__ = ['DEIM'] \ No newline at end of file diff --git a/deim/_configs/_base/dataloader.yml b/deim/_configs/_base/dataloader.yml new file mode 100644 index 00000000..e30b369a --- /dev/null +++ b/deim/_configs/_base/dataloader.yml @@ -0,0 +1,44 @@ +# DataLoader configuration + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + # Thermal-specific augmentations for mining environment + - {type: GaussianBlur, kernel_size: [3, 5], sigma: [0.1, 2.0], p: 0.3} # Heat shimmer, dust, motion blur + - {type: RandomRotation, degrees: 10, p: 0.5} # Vehicle angles, camera mounting variation + - {type: RandomPerspective, distortion_scale: 0.2, p: 0.3} # Camera position variation + - {type: RandomAdjustSharpness, sharpness_factor: 2, p: 0.3} # Thermal camera focus variation + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 64 # 89% of base 72 epochs - auto-scaled when epochs overridden + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop', 'GaussianBlur', 'RandomRotation', 'RandomPerspective', 'RandomAdjustSharpness'] + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + base_size_repeat: 3 + stop_epoch: 64 # 89% of base 72 epochs - auto-scaled when epochs overridden + + shuffle: True + total_batch_size: 32 + num_workers: 4 + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 64 + num_workers: 4 \ No newline at end of file diff --git a/deim/_configs/_base/dataloader_sides.yml b/deim/_configs/_base/dataloader_sides.yml new file mode 100644 index 00000000..db5a9369 --- /dev/null +++ b/deim/_configs/_base/dataloader_sides.yml @@ -0,0 +1,42 @@ +# DataLoader configuration for SIDES model (wheels - larger objects) + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} # Thermal signature variations + - {type: RandomZoomOut, fill: 0} # OK for large wheels + - {type: RandomIoUCrop, p: 0.8} # OK for large objects + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} # Left/right symmetry + # Thermal-specific augmentations for mining environment + - {type: GaussianBlur, kernel_size: [3, 5], sigma: [0.1, 2.0], p: 0.3} # Heat shimmer, dust, motion blur + - {type: RandomAdjustSharpness, sharpness_factor: 2, p: 0.3} # Thermal camera focus variation + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: 
ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 71 # 89% of 80 epochs for sides + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop', 'GaussianBlur', 'RandomAdjustSharpness'] + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + base_size_repeat: 3 + stop_epoch: 71 # 89% of 80 epochs for sides + + shuffle: True + total_batch_size: 32 + num_workers: 4 + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 128 + num_workers: 4 diff --git a/deim/_configs/_base/dataloader_under.yml b/deim/_configs/_base/dataloader_under.yml new file mode 100644 index 00000000..42d2d6b0 --- /dev/null +++ b/deim/_configs/_base/dataloader_under.yml @@ -0,0 +1,44 @@ +# DataLoader configuration for UNDER model (wheelsโ†’hubsโ†’brakes - nested small objects) + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} # Thermal signature variations (hot brakes, hub heat) + - {type: RandomZoomOut, fill: 0} # Simulates varying truck distances from camera + - {type: RandomIoUCrop, p: 0.3} # REDUCED from 0.8 - less aggressive to preserve nested hierarchy + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} # Left/right wheel symmetry + # Thermal-specific augmentations for mining environment + - {type: GaussianBlur, kernel_size: [3, 5], sigma: [0.1, 2.0], p: 0.3} # Heat shimmer from hot brakes, dust + - {type: RandomAdjustSharpness, sharpness_factor: 2, p: 0.3} # Critical for hub bolt details + # REMOVED RandomRotation - circular wheels/hubs don't benefit, radially symmetric + # REMOVED RandomPerspective - distorts circular features, hurts bolt pattern detection + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 285 # 89% of 320 epochs for under + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop', 'GaussianBlur', 'RandomAdjustSharpness'] + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + base_size_repeat: 3 + stop_epoch: 285 # 89% of 320 epochs for under + + shuffle: True + total_batch_size: 8 + num_workers: 4 + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640], } + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 128 + num_workers: 4 diff --git a/deim/_configs/_base/dataset_sides.yml b/deim/_configs/_base/dataset_sides.yml new file mode 100644 index 00000000..d31816b4 --- /dev/null +++ b/deim/_configs/_base/dataset_sides.yml @@ -0,0 +1,45 @@ +# Dataset configuration for 'sides' detection + +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +num_classes: 2 +class_names: + 0: wheel + 1: hub +remap_mscoco_category: True + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_sides/train/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_sides/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 8 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +val_dataloader: + type: DataLoader + 
dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_sides/val/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_sides/annotations/instances_val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 8 + drop_last: False + collate_fn: + type: BatchImageCollateFunction \ No newline at end of file diff --git a/deim/_configs/_base/dataset_under.yml b/deim/_configs/_base/dataset_under.yml new file mode 100644 index 00000000..a48bc9eb --- /dev/null +++ b/deim/_configs/_base/dataset_under.yml @@ -0,0 +1,44 @@ +# Dataset configuration for 'under' detection + +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +num_classes: 1 # your dataset classes + 1 (background) +class_names: + 0: brakes +remap_mscoco_category: True + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_under/train/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_under/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 8 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_under/val/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_under/annotations/instances_val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 8 + drop_last: False + collate_fn: + type: BatchImageCollateFunction \ No newline at end of file diff --git a/deim/_configs/_base/dfine_hgnetv2.yml b/deim/_configs/_base/dfine_hgnetv2.yml new file mode 100644 index 00000000..be12c77e --- /dev/null +++ b/deim/_configs/_base/dfine_hgnetv2.yml @@ -0,0 +1,85 @@ +# DFINE HGNetv2 base configuration + +task: detection + +model: DEIM +criterion: DEIMCriterion +postprocessor: PostProcessor + +use_focal_loss: True +eval_spatial_size: [640, 640] # h w +checkpoint_freq: 4 # save freq + +DEIM: + backbone: HGNetv2 + encoder: HybridEncoder + decoder: DFINETransformer + +# Learning rate scheduler +lrsheduler: flatcosine +lr_gamma: 1 +warmup_iter: 500 +flat_epoch: 4000000 +no_aug_epoch: 0 + +HGNetv2: + pretrained: True + +HybridEncoder: + in_channels: [512, 1024, 2048] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + +DFINETransformer: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_layers: 6 + eval_idx: -1 + num_queries: 300 + + num_denoising: 100 + label_noise_ratio: 0.5 + box_noise_scale: 1.0 + + # NEW + reg_max: 32 + reg_scale: 4 + + # Auxiliary decoder layers dimension scaling + layer_scale: 1 + + num_points: [3, 6, 3] + cross_attn_method: default + query_select_method: default + +PostProcessor: + num_top_queries: 300 + +DEIMCriterion: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_fgl: 0.15, loss_ddf: 1.5} + losses: ['vfl', 'boxes', 'local'] + alpha: 0.75 + gamma: 2.0 + reg_max: 32 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + alpha: 0.25 + gamma: 2.0 \ No newline at end of file diff --git a/deim/_configs/_base/optimizer.yml b/deim/_configs/_base/optimizer.yml new file mode 100644 index 00000000..d91444d1 --- /dev/null +++ b/deim/_configs/_base/optimizer.yml @@ -0,0 +1,35 @@ +# Optimizer configuration + +use_amp: True +use_ema: True +ema: + type: ModelEMA + decay: 0.9999 + warmups: 1000 + start: 0 + +epoches: 72 # Note: using 'epoches' to match original config spelling +clip_max_norm: 0.1 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.0000125 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.00025 + betas: [0.9, 0.999] + weight_decay: 0.000125 + +lr_scheduler: + type: MultiStepLR + milestones: [500] + gamma: 0.1 + +lr_warmup_scheduler: + type: LinearWarmup + warmup_duration: 500 \ No newline at end of file diff --git a/deim/_configs/_base/runtime.yml b/deim/_configs/_base/runtime.yml new file mode 100644 index 00000000..5edf9759 --- /dev/null +++ b/deim/_configs/_base/runtime.yml @@ -0,0 +1,19 @@ +# Runtime configuration + +print_freq: 100 +output_dir: './logs' +checkpoint_freq: 12 + +sync_bn: True +find_unused_parameters: False + +use_amp: True +scaler: + type: GradScaler + enabled: True + +use_ema: False +ema: + type: ModelEMA + decay: 0.9999 + warmups: 1000 \ No newline at end of file diff --git a/deim/_configs/sides.yml b/deim/_configs/sides.yml new file mode 100644 index 00000000..8b503720 --- /dev/null +++ b/deim/_configs/sides.yml @@ -0,0 +1,80 @@ +# DEIM Configuration for 'sides' detection +# Adapted from _old/deim_model_sides.yml + +__include__: + [ + "./_base/dataset_sides.yml", + "./_base/runtime.yml", + "./_base/dfine_hgnetv2.yml", + "./_base/dataloader_sides.yml", + "./_base/optimizer.yml", + ] + +output_dir: ./deim_outputs/sides + +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: "B0" + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + + num_layers: 3 + eval_idx: -1 + + num_points: [6, 6] + +optimizer: + type: AdamW + params: + - params: "^(?=.*backbone)(?!.*norm|bn).*$" + lr: 0.0004 + - params: "^(?=.*backbone)(?=.*norm|bn).*$" + lr: 0.0004 + weight_decay: 0. + - params: "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$" + weight_decay: 0. 
+ + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Training parameters +epoches: 80 # Note: using 'epoches' to match original config spelling + +train_dataloader: + total_batch_size: 32 + dataset: + transforms: + policy: + epoch: 71 # 89% of 80 epochs + collate_fn: + stop_epoch: 71 # 89% of 80 epochs + ema_restart_decay: 0.9999 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 128 \ No newline at end of file diff --git a/deim/_configs/under.yml b/deim/_configs/under.yml new file mode 100644 index 00000000..bd5bfb03 --- /dev/null +++ b/deim/_configs/under.yml @@ -0,0 +1,80 @@ +# DEIM Configuration for 'under' detection +# Adapted from _old/deim_model_under.yml + +__include__: + [ + "./_base/dataset_under.yml", + "./_base/runtime.yml", + "./_base/dfine_hgnetv2.yml", + "./_base/dataloader_under.yml", + "./_base/optimizer.yml", + ] + +output_dir: ./deim_outputs/under + +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: "B0" + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + + # intra + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + + # cross + expansion: 0.34 + depth_mult: 0.5 + +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + + num_layers: 3 + eval_idx: -1 + + num_points: [6, 6] + +optimizer: + type: AdamW + params: + - params: "^(?=.*backbone)(?!.*norm|bn).*$" + lr: 0.0004 + - params: "^(?=.*backbone)(?=.*norm|bn).*$" + lr: 0.0004 + weight_decay: 0. + - params: "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$" + weight_decay: 0. + + lr: 0.0008 + betas: [0.9, 0.999] + weight_decay: 0.0001 + +# Training parameters +epoches: 320 # Note: using 'epoches' to match original config spelling + +train_dataloader: + total_batch_size: 8 + dataset: + transforms: + policy: + epoch: 285 # 89% of 320 epochs + collate_fn: + stop_epoch: 285 # 89% of 320 epochs + ema_restart_decay: 0.9999 + base_size_repeat: ~ + +val_dataloader: + total_batch_size: 128 \ No newline at end of file diff --git a/deim/_core/__init__.py b/deim/_core/__init__.py new file mode 100644 index 00000000..da88ec55 --- /dev/null +++ b/deim/_core/__init__.py @@ -0,0 +1 @@ +"""Core modules for DEIM""" \ No newline at end of file diff --git a/deim/_core/config.py b/deim/_core/config.py new file mode 100644 index 00000000..77adcbd8 --- /dev/null +++ b/deim/_core/config.py @@ -0,0 +1,200 @@ +""" +Configuration management for DEIM +Handles loading configs for 'under', 'sides', or custom YAML files +""" + +import os +import sys +from pathlib import Path +from typing import Dict, Any, Optional +import yaml +import copy + + +class ConfigManager: + """ + Manages configuration for DEIM models + + Handles: + - Loading predefined configs ('under', 'sides') + - Loading custom YAML configs + - Applying parameter overrides + - Dataset path resolution + """ + + def __init__(self, config: str): + """ + Initialize config manager + + Args: + config: 'under', 'sides', or path to custom YAML + """ + self.config_type = config + self.base_config = self._load_base_config(config) + self.config = copy.deepcopy(self.base_config) + + def _load_base_config(self, config: str) -> Dict[str, Any]: + """Load the base configuration""" + + # Get config directory + config_dir = Path(__file__).parent.parent / "_configs" + + if config == 'under': + config_path = config_dir / "under.yml" + dataset_name = "yolo_dataset_under" + + elif config == 'sides': + config_path 
= config_dir / "sides.yml" + dataset_name = "yolo_dataset_sides" + + else: + # Custom config path + config_path = Path(config) + if not config_path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + dataset_name = None + + # Load YAML config + with open(config_path, 'r') as f: + cfg = yaml.safe_load(f) + + # Handle includes if present + if "__include__" in cfg: + includes = cfg.pop("__include__") + base_cfg = {} + + for include_path in includes: + # Resolve include path relative to config file + if not Path(include_path).is_absolute(): + include_path = config_path.parent / include_path + + # Check if the path exists + if include_path.exists(): + with open(include_path, 'r') as f: + include_cfg = yaml.safe_load(f) + if include_cfg: + base_cfg = self._merge_configs(base_cfg, include_cfg) + else: + print(f"Warning: Include file not found: {include_path}") + + # Merge with main config + cfg = self._merge_configs(base_cfg, cfg) + + # Set dataset path if using predefined configs + if dataset_name: + dataset_base = Path("/home/hidara/Documents/datasets") + cfg['dataset_path'] = str(dataset_base / dataset_name) + + # Update dataset config paths + if 'Dataset' in cfg: + cfg['Dataset']['img_folder'] = str(dataset_base / dataset_name) + cfg['Dataset']['ann_file'] = str(dataset_base / dataset_name) + + # Update train/val dataset paths + if 'Train' in cfg: + if 'Dataset' in cfg['Train']: + cfg['Train']['Dataset']['img_folder'] = str(dataset_base / dataset_name / "train") + cfg['Train']['Dataset']['ann_file'] = str(dataset_base / dataset_name / "train") + + if 'Eval' in cfg: + if 'Dataset' in cfg['Eval']: + cfg['Eval']['Dataset']['img_folder'] = str(dataset_base / dataset_name / "val") + cfg['Eval']['Dataset']['ann_file'] = str(dataset_base / dataset_name / "val") + + return cfg + + def _merge_configs(self, base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + """Recursively merge two config dictionaries""" + result = copy.deepcopy(base) + + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = self._merge_configs(result[key], value) + else: + result[key] = value + + return result + + def apply_overrides(self, overrides: Dict[str, Any]) -> Dict[str, Any]: + """ + Apply parameter overrides to config + + Args: + overrides: Dictionary of parameters to override + + Returns: + Updated config dictionary + """ + self.config = copy.deepcopy(self.base_config) + + # Map common parameter names to config structure + # Note: The original config uses 'epoches' (with an 'e') + param_mapping = { + 'epochs': 'epoches', # Map to 'epoches' as used in original configs + 'batch_size': 'train_dataloader.total_batch_size', + 'learning_rate': 'optimizer.lr', + 'dataset_path': 'dataset_path', + 'output_dir': 'output_dir', + } + + for key, value in overrides.items(): + if key in param_mapping: + # Use mapped path + path = param_mapping[key] + self._set_nested(self.config, path, value) + else: + # Direct assignment + self.config[key] = value + + # Auto-scale epoch-dependent parameters if epochs were overridden + if 'epochs' in overrides: + self._scale_epoch_dependent_params(overrides['epochs']) + + return self.config + + def _scale_epoch_dependent_params(self, new_epochs: int): + """ + Automatically scale epoch-dependent parameters when epochs change + + Args: + new_epochs: New number of epochs + """ + # Calculate scaling factor (use ~89% of total epochs for stopping augmentation) + # Stage 1 (1-89%): 
Training with thermal augmentation + # Stage 2 (90-100%): Fine-tuning without augmentation + stop_epoch = int(new_epochs * 0.89) + + # Scale transforms policy epoch (when data augmentation stops) + if 'train_dataloader' in self.config: + if 'dataset' in self.config['train_dataloader']: + if 'transforms' in self.config['train_dataloader']['dataset']: + if 'policy' in self.config['train_dataloader']['dataset']['transforms']: + self.config['train_dataloader']['dataset']['transforms']['policy']['epoch'] = stop_epoch + print(f" Auto-scaled transforms.policy.epoch: {stop_epoch} (89% of {new_epochs})") + + # Scale collate_fn stop_epoch (when multi-scale training stops) + if 'collate_fn' in self.config['train_dataloader']: + if 'stop_epoch' in self.config['train_dataloader']['collate_fn']: + self.config['train_dataloader']['collate_fn']['stop_epoch'] = stop_epoch + print(f" Auto-scaled collate_fn.stop_epoch: {stop_epoch} (89% of {new_epochs})") + + def _set_nested(self, cfg: Dict[str, Any], path: str, value: Any): + """Set a nested config value using dot notation""" + keys = path.split('.') + current = cfg + + for key in keys[:-1]: + if key not in current: + current[key] = {} + current = current[key] + + current[keys[-1]] = value + + def get_config(self) -> Dict[str, Any]: + """Get current configuration""" + return self.config + + def save_config(self, path: str): + """Save current configuration to file""" + with open(path, 'w') as f: + yaml.dump(self.config, f, default_flow_style=False) \ No newline at end of file diff --git a/deim/_core/predictor.py b/deim/_core/predictor.py new file mode 100644 index 00000000..8ec2c86b --- /dev/null +++ b/deim/_core/predictor.py @@ -0,0 +1,457 @@ +""" +Inference module for DEIM +Handles prediction on images, videos, and batches +""" + +import sys +from pathlib import Path +from typing import Union, List, Dict, Any, Optional +import torch +import torch.nn as nn +import torchvision.transforms as T +import numpy as np +from PIL import Image +import cv2 + + +class Predictor: + """ + Inference handler for DEIM models + + Handles: + - Single image inference + - Batch image inference + - Video inference + - Visualization with supervision + """ + + def __init__(self, config: Dict[str, Any], checkpoint_path: str, device: torch.device): + """ + Initialize predictor with model + + Args: + config: Configuration dictionary + checkpoint_path: Path to model checkpoint + device: PyTorch device + """ + self.config = config + self.device = device + self.checkpoint_path = checkpoint_path + + # Load model + self.model = self._load_model(checkpoint_path) + self.model.eval() + + # Set default image size + self.img_size = config.get('img_size', 640) + + # Initialize supervision for visualization if available + try: + import supervision as sv + self.supervision_available = True + self.box_annotator = sv.BoxAnnotator() + self.label_annotator = sv.LabelAnnotator(smart_position=True) + except ImportError: + self.supervision_available = False + print("โš ๏ธ Supervision not installed. Visualization disabled.") + + def _load_model(self, checkpoint_path: str) -> nn.Module: + """Load model from checkpoint""" + + # By importing these, we are registering the modules in the workspace + import deim._engine.backbone + import deim._engine.deim + + try: + from deim._engine.core.yaml_config import YAMLConfig + import yaml + import tempfile + import os + from collections import OrderedDict + + # YAMLConfig needs a config file path. We have a dict. + # So, we write the dict to a temporary file. 
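+            # delete=False keeps the file on disk so YAMLConfig can reopen it by
+            # path (required on Windows); it is removed manually below once the
+            # model has been built.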
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.yml') as f: + yaml.dump(self.config, f) + temp_config_path = f.name + + # The YAMLConfig class modifies the output_dir by adding a timestamp. + # To avoid creating unwanted directories, we can point it to a temp dir. + # However, let's first try without this and see if it's a problem. + # The config from the API should have 'output_dir' set. + + # Create YAMLConfig object. This will also handle model creation. + cfg = YAMLConfig(temp_config_path) + model = cfg.model + + os.remove(temp_config_path) + + # Load the checkpoint + checkpoint = torch.load(checkpoint_path, map_location=self.device) + + # Extract state dict from checkpoint + if 'ema' in checkpoint and checkpoint['ema'] is not None: + state_dict = checkpoint['ema']['module'] + print("INFO: Loading EMA weights for inference.") + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + elif 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # Clean state dict keys if they are from a parallelized model + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] if k.startswith('module.') else k + new_state_dict[name] = v + + model.load_state_dict(new_state_dict) + + model = model.to(self.device) + model.eval() + + return model + + except Exception as e: + print(f"โŒ Failed to load model dynamically: {e}") + print(" Please ensure your checkpoint and config are compatible.") + raise e + + def predict(self, + sources: Union[str, List[str]], + conf_threshold: float = 0.4, + visualize: bool = False, + save_path: Optional[str] = None, + save_dir: Optional[str] = None) -> Union[Dict, List[Dict]]: + """ + Run inference on sources + + Args: + sources: Image path(s) or video path + conf_threshold: Confidence threshold + visualize: Whether to visualize results + save_path: Path to save single output + save_dir: Directory to save batch outputs + + Returns: + Detection results + """ + + # Handle different source types + if isinstance(sources, str): + source_path = Path(sources) + + if source_path.suffix.lower() in ['.mp4', '.avi', '.mov']: + # Video inference + return self._predict_video( + sources, conf_threshold, visualize, save_path + ) + else: + # Single image + results = self._predict_image( + sources, conf_threshold, visualize + ) + + if visualize and save_path: + self._save_image(results['visualization'], save_path) + + return results + + elif isinstance(sources, list): + # Batch inference + return self._predict_batch( + sources, conf_threshold, visualize, save_dir + ) + + def _predict_image(self, + image_path: str, + conf_threshold: float, + visualize: bool) -> Dict[str, Any]: + """Predict on single image""" + + # Load and preprocess image + image = Image.open(image_path).convert('RGB') + image_np = np.array(image) + + # Resize image + resized, scale, pad_info = self._resize_image(image_np, self.img_size) + + # Convert to tensor + transform = T.Compose([T.ToTensor()]) + image_tensor = transform(resized).unsqueeze(0).to(self.device) + + # Run inference + with torch.no_grad(): + outputs = self.model(image_tensor) + + # Process outputs - pass original image shape and padding info + orig_h, orig_w = image_np.shape[:2] + results = self._process_outputs(outputs, (orig_w, orig_h), scale, pad_info, conf_threshold) + + # Add visualization if requested + if visualize and self.supervision_available: + results['visualization'] = self._visualize_detections( + image_np, results + ) + + 
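+        # Shape of the returned dict: 'boxes' is an (N, 4) xyxy array in
+        # original-image pixels (padding and scale are undone in
+        # _process_outputs), 'scores' and 'labels' are (N,) arrays, and
+        # 'visualization' is only present when visualize=True and supervision
+        # is installed; image_path / image_size metadata are attached below.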
results['image_path'] = image_path + results['image_size'] = image_np.shape[:2] + + return results + + def _predict_batch(self, + image_paths: List[str], + conf_threshold: float, + visualize: bool, + save_dir: Optional[str]) -> List[Dict[str, Any]]: + """Predict on batch of images""" + + results = [] + + for idx, image_path in enumerate(image_paths): + print(f" Processing {idx + 1}/{len(image_paths)}: {image_path}") + + result = self._predict_image(image_path, conf_threshold, visualize) + results.append(result) + + if visualize and save_dir and 'visualization' in result: + save_path = Path(save_dir) / f"{Path(image_path).stem}_pred.jpg" + self._save_image(result['visualization'], str(save_path)) + + return results + + def _predict_video(self, + video_path: str, + conf_threshold: float, + visualize: bool, + save_path: Optional[str]) -> Dict[str, Any]: + """Predict on video""" + + cap = cv2.VideoCapture(video_path) + fps = int(cap.get(cv2.CAP_PROP_FPS)) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + results = { + 'video_path': video_path, + 'fps': fps, + 'resolution': (width, height), + 'total_frames': total_frames, + 'frame_results': [] + } + + # Setup video writer if saving + if visualize and save_path: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(save_path, fourcc, fps, (width, height)) + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Convert BGR to RGB + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + + # Process frame + frame_result = self._process_frame( + frame_rgb, conf_threshold, visualize + ) + frame_result['frame_idx'] = frame_idx + results['frame_results'].append(frame_result) + + # Write visualized frame if requested + if visualize and save_path and 'visualization' in frame_result: + vis_frame = frame_result['visualization'] + vis_frame_bgr = cv2.cvtColor(vis_frame, cv2.COLOR_RGB2BGR) + out.write(vis_frame_bgr) + + frame_idx += 1 + + if frame_idx % 30 == 0: + print(f" Processed {frame_idx}/{total_frames} frames") + + cap.release() + if visualize and save_path: + out.release() + + print(f" โœ“ Processed {frame_idx} frames") + + return results + + def _process_frame(self, + frame: np.ndarray, + conf_threshold: float, + visualize: bool) -> Dict[str, Any]: + """Process single video frame""" + + # Resize frame + resized, scale = self._resize_image(frame, self.img_size) + + # Convert to tensor + transform = T.Compose([T.ToTensor()]) + frame_tensor = transform(resized).unsqueeze(0).to(self.device) + + # Run inference + with torch.no_grad(): + outputs = self.model(frame_tensor) + + # Process outputs + results = self._process_outputs(outputs, scale, conf_threshold) + + # Add visualization if requested + if visualize and self.supervision_available: + results['visualization'] = self._visualize_detections(frame, results) + + return results + + def _resize_image(self, image: np.ndarray, target_size: int): + """Resize image maintaining aspect ratio + + Returns: + padded: Padded square image + scale: Scaling factor applied + pad_info: Dict with padding offsets {'top', 'left'} + """ + + h, w = image.shape[:2] + scale = target_size / max(h, w) + + new_w = int(w * scale) + new_h = int(h * scale) + + resized = cv2.resize(image, (new_w, new_h)) + + # Pad to square + pad_w = target_size - new_w + pad_h = target_size - new_h + + top = pad_h // 2 + bottom = pad_h - top + left = pad_w // 2 + right = pad_w - left + + 
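+        # Worked example: a 1280x720 frame with target_size=640 gives
+        # scale=0.5 and a 640x360 resize, so pad_h=280 (top=140, bottom=140)
+        # and pad_w=0; (114, 114, 114) is the usual letterbox gray fill.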
padded = cv2.copyMakeBorder( + resized, top, bottom, left, right, + cv2.BORDER_CONSTANT, value=(114, 114, 114) + ) + + pad_info = {'top': top, 'left': left} + return padded, scale, pad_info + + def _process_outputs(self, + outputs: Dict[str, torch.Tensor], + orig_size: tuple, + scale: float, + pad_info: dict, + conf_threshold: float) -> Dict[str, Any]: + """Process model outputs to detection format + + Args: + outputs: Model outputs dict with 'pred_logits' and 'pred_boxes' + orig_size: Original image size as (width, height) - COCO format + scale: Scaling factor used during resize + pad_info: Padding information {'top': int, 'left': int} + conf_threshold: Confidence threshold for filtering + """ + + # DEIM returns dict with 'pred_logits' and 'pred_boxes' + # Boxes are relative to the padded 640x640 image + # Need to: 1) scale to 640x640, 2) remove padding, 3) scale to original + + # Import postprocessor + from deim._engine.deim.postprocessor import PostProcessor + + # Initialize postprocessor if not already done + if not hasattr(self, 'postprocessor'): + self.postprocessor = PostProcessor( + num_classes=self.config.get('num_classes', 80), + use_focal_loss=self.config.get('use_focal_loss', True), + num_top_queries=self.config.get('num_top_queries', 300) + ) + # Enable deploy mode for tuple output + self.postprocessor.deploy() + + # Use padded image size (640x640) for postprocessor + batch_size = outputs['pred_logits'].shape[0] + padded_size = torch.tensor([[self.img_size, self.img_size]] * batch_size, + dtype=torch.float32, + device=self.device) + + # Apply postprocessor - returns (labels, boxes, scores) when in deploy mode + # Boxes are now in pixel coordinates relative to 640x640 padded image + labels, boxes, scores = self.postprocessor(outputs, padded_size) + + # Convert to numpy and get first batch element + labels = labels[0].cpu().numpy() + boxes = boxes[0].cpu().numpy() + scores = scores[0].cpu().numpy() + + # Remove padding offset - boxes are in xyxy format + # [x1, y1, x2, y2] relative to padded image + boxes[:, [0, 2]] -= pad_info['left'] # x coordinates + boxes[:, [1, 3]] -= pad_info['top'] # y coordinates + + # Scale back to original image size + # Boxes are currently relative to resized image (after removing padding) + boxes /= scale + + # Apply confidence threshold + mask = scores > conf_threshold + boxes = boxes[mask] + scores = scores[mask] + labels = labels[mask] + + return { + 'boxes': boxes, + 'scores': scores, + 'labels': labels, + 'num_detections': len(boxes) + } + + def _visualize_detections(self, + image: np.ndarray, + results: Dict[str, Any]) -> np.ndarray: + """Visualize detections using supervision""" + + if not self.supervision_available: + return image + + try: + import supervision as sv + + # Create detections object + detections = sv.Detections( + xyxy=results['boxes'], + confidence=results['scores'], + class_id=results['labels'].astype(int) + ) + + # Get class names (if available in config) + class_names = self.config.get('class_names', {}) + labels = [ + f"{class_names.get(int(class_id), f'Class {class_id}')} {score:.2f}" + for class_id, score in zip(results['labels'], results['scores']) + ] + + # Annotate image + annotated = self.box_annotator.annotate( + scene=image.copy(), detections=detections + ) + annotated = self.label_annotator.annotate( + scene=annotated, detections=detections, labels=labels + ) + + return annotated + + except Exception as e: + print(f"Visualization error: {str(e)}") + return image + + def _save_image(self, image: np.ndarray, 
save_path: str): + """Save image to file""" + Image.fromarray(image).save(save_path) + print(f" Saved: {save_path}") \ No newline at end of file diff --git a/deim/_core/trainer.py b/deim/_core/trainer.py new file mode 100644 index 00000000..7269b0dd --- /dev/null +++ b/deim/_core/trainer.py @@ -0,0 +1,213 @@ +""" +Training orchestration for DEIM +Wraps the existing training logic from _old/train.py +""" + +import os +import sys +import subprocess +from pathlib import Path +from typing import Dict, Any, Optional +import torch +import yaml + + +class Trainer: + """ + Training orchestrator for DEIM models + + Handles training workflow similar to new_run.py but through Python API + """ + + def __init__(self, config: Dict[str, Any], device: torch.device, pretrained: Optional[str] = None): + """ + Initialize trainer + + Args: + config: Configuration dictionary + device: PyTorch device + pretrained: Path to pretrained weights or None + """ + self.config = config + self.device = device + self.pretrained = pretrained + + # Get paths + self.deim_root = Path(__file__).parent.parent + self.train_script = self.deim_root / "_engine" / "train.py" + + # Ensure train script exists + if not self.train_script.exists(): + raise FileNotFoundError(f"Training script not found: {self.train_script}") + + def train(self, output_dir: str) -> Dict[str, Any]: + """ + Run training process + + Args: + output_dir: Directory to save outputs + + Returns: + Dictionary with training results + """ + + # Create output directory + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Save config for this training run + config_save_path = output_path / "config.yml" + with open(config_save_path, 'w') as f: + yaml.dump(self.config, f, default_flow_style=False) + + # Build training command (similar to new_run.py) + # Use absolute paths for config and output + cmd = [ + sys.executable, # Use current Python interpreter + "-m", "torch.distributed.run", + "--master_port=7777", + "--nproc_per_node=1", + "train.py", # Use relative path since we're running from _engine + "-c", str(config_save_path.absolute()), # Use absolute path + "--use-amp", + "--seed=0", + ] + + # Add pretrained weights if provided + if self.pretrained: + # Convert to absolute path if relative + pretrained_path = Path(self.pretrained) + if not pretrained_path.is_absolute(): + pretrained_path = pretrained_path.absolute() + cmd.extend(["-t", str(pretrained_path)]) + + # Set output directory in command (absolute path) + cmd.extend(["--output-dir", str(output_path.absolute())]) + + # Set environment variables + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(self.device.index) if self.device.index is not None else "0" + + print("\n๐Ÿš€ Launching training process...") + print(f"Command: {' '.join(cmd)}") + + try: + # Run training from _engine directory + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + cwd=str(self.deim_root / "_engine") # Run from _engine directory + ) + + # Stream output in real-time + for line in process.stdout: + print(line.rstrip()) + + # Wait for completion + return_code = process.wait() + + if return_code != 0: + raise RuntimeError(f"Training failed with return code {return_code}") + + print("\nโœ“ Training completed successfully") + + # Collect results + results = { + 'output_dir': str(output_path), + 'config_path': str(config_save_path), + 'best_model': str(output_path / "best_stg1.pth"), + } + + # Check for stage 2 model + 
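+            # (best_stg2.pth is typically only written if the run reaches the
+            # second, no-augmentation stage, so it is treated as optional here)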
stage2_path = output_path / "best_stg2.pth" + if stage2_path.exists(): + results['best_model_stage2'] = str(stage2_path) + + return results + + except Exception as e: + print(f"\nโŒ Training failed: {str(e)}") + raise + + +# Alternative implementation using direct module import (if subprocess doesn't work) +class DirectTrainer: + """ + Direct training using module imports instead of subprocess + This is a backup implementation + """ + + def __init__(self, config: Dict[str, Any], device: torch.device, pretrained: Optional[str] = None): + """Initialize direct trainer""" + self.config = config + self.device = device + self.pretrained = pretrained + + def train(self, output_dir: str) -> Dict[str, Any]: + """ + Run training directly by importing modules + + Note: This is more complex as it requires setting up the training + environment properly. The subprocess approach above is preferred. + """ + + # Add engine to path + engine_path = Path(__file__).parent.parent / "_engine" + if str(engine_path) not in sys.path: + sys.path.insert(0, str(engine_path)) + + # Import training modules + from core import YAMLConfig + from solver import TASKS + + # Create output directory + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Update config + self.config['output_dir'] = str(output_path) + + # Save config + config_path = output_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(self.config, f, default_flow_style=False) + + # Create args-like object + class Args: + def __init__(self, config_path, pretrained=None): + self.config = config_path + self.tuning = pretrained + self.resume = None + self.test_only = False + self.use_amp = True + self.seed = 0 + self.print_rank = 0 + self.print_method = 'builtin' + self.output_dir = str(output_path) + + args = Args(str(config_path), self.pretrained) + + # Load config + cfg = YAMLConfig(args.config) + + # Get task (detector) + task = TASKS.get(cfg.get('task', 'detection')) + + # Create solver + solver = task(cfg, args) + + # Run training + solver.train() + + # Return results + results = { + 'output_dir': str(output_path), + 'config_path': str(config_path), + 'best_model': str(output_path / "best_stg1.pth"), + } + + return results \ No newline at end of file diff --git a/deim/_data/__init__.py b/deim/_data/__init__.py new file mode 100644 index 00000000..a32280cb --- /dev/null +++ b/deim/_data/__init__.py @@ -0,0 +1,7 @@ +"""Data handling modules for DEIM""" + +from .dataset import Dataset +from .dataloader import DataLoader +from .transforms import Transforms, get_transform + +__all__ = ['Dataset', 'DataLoader', 'Transforms', 'get_transform'] \ No newline at end of file diff --git a/deim/_data/dataloader.py b/deim/_data/dataloader.py new file mode 100644 index 00000000..305ce7b9 --- /dev/null +++ b/deim/_data/dataloader.py @@ -0,0 +1,79 @@ +""" +DataLoader wrapper for DEIM +Handles batch creation and collation for object detection +""" + +import torch +from torch.utils.data import DataLoader as TorchDataLoader +from typing import Dict, Any, Optional, List + + +def collate_fn(batch): + """ + Custom collate function for object detection + + Handles variable number of objects per image + """ + images = [] + targets = [] + + for image, target in batch: + images.append(image) + targets.append(target) + + # Stack images into batch + images = torch.stack(images, 0) + + return images, targets + + +class DataLoader: + """ + DataLoader wrapper for DEIM training and inference + + Provides sensible defaults for object 
detection + """ + + def __init__(self, + dataset, + batch_size: int = 32, + shuffle: bool = False, + num_workers: int = 4, + pin_memory: bool = True, + drop_last: bool = False, + **kwargs): + """ + Initialize DataLoader + + Args: + dataset: Dataset object + batch_size: Batch size + shuffle: Whether to shuffle data + num_workers: Number of data loading workers + pin_memory: Whether to pin memory for GPU transfer + drop_last: Whether to drop last incomplete batch + **kwargs: Additional arguments for PyTorch DataLoader + """ + + self.dataset = dataset + self.batch_size = batch_size + + # Create PyTorch DataLoader + self.loader = TorchDataLoader( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + pin_memory=pin_memory, + drop_last=drop_last, + collate_fn=collate_fn, + **kwargs + ) + + def __iter__(self): + """Iterate through batches""" + return iter(self.loader) + + def __len__(self): + """Get number of batches""" + return len(self.loader) \ No newline at end of file diff --git a/deim/_data/dataset.py b/deim/_data/dataset.py new file mode 100644 index 00000000..cdd39c9a --- /dev/null +++ b/deim/_data/dataset.py @@ -0,0 +1,214 @@ +""" +Dataset handling for DEIM +Supports COCO and YOLO annotation formats with auto-detection +""" + +import os +import json +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple +import torch +from torch.utils.data import Dataset as TorchDataset +from PIL import Image +import numpy as np + + +class Dataset(TorchDataset): + """ + DEIM Dataset handler + + Auto-detects COCO vs YOLO format and loads accordingly + """ + + def __init__(self, + data_path: str, + split: str = 'train', + transform: Optional[Any] = None): + """ + Initialize dataset + + Args: + data_path: Path to dataset root + split: Dataset split ('train', 'val', 'test') + transform: Optional transforms to apply + """ + self.data_path = Path(data_path) + self.split = split + self.transform = transform + + # Auto-detect format + self.format = self._detect_format() + + # Load annotations + if self.format == 'coco': + self._load_coco() + elif self.format == 'yolo': + self._load_yolo() + else: + raise ValueError(f"Unknown dataset format at {data_path}") + + def _detect_format(self) -> str: + """Auto-detect dataset format""" + + # Check for COCO format + coco_ann = self.data_path / 'annotations' / f'{self.split}.json' + if coco_ann.exists(): + return 'coco' + + # Check for YOLO format + yolo_dir = self.data_path / self.split + if yolo_dir.exists(): + # Look for .txt label files + label_files = list(yolo_dir.glob('*.txt')) + if label_files: + return 'yolo' + + return 'unknown' + + def _load_coco(self): + """Load COCO format annotations""" + ann_file = self.data_path / 'annotations' / f'{self.split}.json' + + with open(ann_file, 'r') as f: + coco_data = json.load(f) + + self.images = coco_data['images'] + self.annotations = coco_data['annotations'] + self.categories = {cat['id']: cat['name'] + for cat in coco_data['categories']} + + # Group annotations by image + self.img_to_anns = {} + for ann in self.annotations: + img_id = ann['image_id'] + if img_id not in self.img_to_anns: + self.img_to_anns[img_id] = [] + self.img_to_anns[img_id].append(ann) + + def _load_yolo(self): + """Load YOLO format annotations""" + split_dir = self.data_path / self.split + + # Get all image files + image_files = [] + for ext in ['*.jpg', '*.jpeg', '*.png', '*.bmp']: + image_files.extend(split_dir.glob(ext)) + + self.images = [] + self.annotations = [] + + for img_path in 
image_files: + # Get corresponding label file + label_path = img_path.with_suffix('.txt') + + if label_path.exists(): + # Read YOLO format labels + with open(label_path, 'r') as f: + lines = f.readlines() + + boxes = [] + labels = [] + + for line in lines: + parts = line.strip().split() + if len(parts) >= 5: + class_id = int(parts[0]) + # YOLO format: x_center, y_center, width, height (normalized) + bbox = [float(x) for x in parts[1:5]] + + labels.append(class_id) + boxes.append(bbox) + + self.images.append({ + 'file_name': img_path.name, + 'path': str(img_path) + }) + + self.annotations.append({ + 'boxes': boxes, + 'labels': labels + }) + + def __len__(self) -> int: + """Get dataset length""" + return len(self.images) + + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict]: + """ + Get item by index + + Returns: + Tuple of (image, target) where target contains boxes and labels + """ + + if self.format == 'coco': + return self._get_coco_item(idx) + else: + return self._get_yolo_item(idx) + + def _get_coco_item(self, idx: int) -> Tuple[torch.Tensor, Dict]: + """Get COCO format item""" + img_info = self.images[idx] + img_path = self.data_path / 'images' / img_info['file_name'] + + # Load image + image = Image.open(img_path).convert('RGB') + + # Get annotations + img_id = img_info['id'] + anns = self.img_to_anns.get(img_id, []) + + boxes = [] + labels = [] + + for ann in anns: + # Convert from COCO bbox format (x, y, w, h) to (x1, y1, x2, y2) + x, y, w, h = ann['bbox'] + boxes.append([x, y, x + w, y + h]) + labels.append(ann['category_id']) + + target = { + 'boxes': torch.as_tensor(boxes, dtype=torch.float32), + 'labels': torch.as_tensor(labels, dtype=torch.int64) + } + + # Apply transforms + if self.transform: + image, target = self.transform(image, target) + + return image, target + + def _get_yolo_item(self, idx: int) -> Tuple[torch.Tensor, Dict]: + """Get YOLO format item""" + img_info = self.images[idx] + + # Load image + image = Image.open(img_info['path']).convert('RGB') + w, h = image.size + + # Get annotations + ann = self.annotations[idx] + + boxes = [] + for box in ann['boxes']: + # Convert from YOLO format (x_center, y_center, width, height) normalized + # to absolute (x1, y1, x2, y2) + x_center, y_center, width, height = box + + x1 = (x_center - width/2) * w + y1 = (y_center - height/2) * h + x2 = (x_center + width/2) * w + y2 = (y_center + height/2) * h + + boxes.append([x1, y1, x2, y2]) + + target = { + 'boxes': torch.as_tensor(boxes, dtype=torch.float32), + 'labels': torch.as_tensor(ann['labels'], dtype=torch.int64) + } + + # Apply transforms + if self.transform: + image, target = self.transform(image, target) + + return image, target \ No newline at end of file diff --git a/deim/_data/transforms.py b/deim/_data/transforms.py new file mode 100644 index 00000000..c0406493 --- /dev/null +++ b/deim/_data/transforms.py @@ -0,0 +1,159 @@ +""" +Transforms for DEIM training and inference +Data augmentation and preprocessing +""" + +import torch +import torchvision.transforms as T +from typing import Tuple, Dict, Any, Optional +import random +import numpy as np +from PIL import Image + + +class Transforms: + """ + Transform pipeline for DEIM + + Handles image preprocessing and augmentation + """ + + def __init__(self, + img_size: int = 640, + augment: bool = False, + normalize: bool = True): + """ + Initialize transforms + + Args: + img_size: Target image size + augment: Whether to apply augmentation (for training) + normalize: Whether to normalize images + """ + 
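+        # Note: this pipeline uses a plain resize to img_size x img_size (no
+        # letterboxing); __call__ rescales the target boxes to match.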
self.img_size = img_size + self.augment = augment + self.normalize = normalize + + # Basic transforms + self.resize = T.Resize((img_size, img_size)) + self.to_tensor = T.ToTensor() + + # Normalization (ImageNet stats) + if normalize: + self.norm = T.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225] + ) + + def __call__(self, + image: Image.Image, + target: Dict[str, Any]) -> Tuple[torch.Tensor, Dict]: + """ + Apply transforms to image and target + + Args: + image: PIL Image + target: Dictionary with 'boxes' and 'labels' + + Returns: + Transformed image and target + """ + + # Get original size + w, h = image.size + + # Apply augmentation if enabled + if self.augment: + image, target = self._augment(image, target) + + # Resize image + image = self.resize(image) + + # Scale boxes to match resized image + if 'boxes' in target and len(target['boxes']) > 0: + boxes = target['boxes'] + # Scale boxes + scale_x = self.img_size / w + scale_y = self.img_size / h + + boxes[:, [0, 2]] *= scale_x + boxes[:, [1, 3]] *= scale_y + + target['boxes'] = boxes + + # Convert to tensor + image = self.to_tensor(image) + + # Normalize if enabled + if self.normalize: + image = self.norm(image) + + return image, target + + def _augment(self, + image: Image.Image, + target: Dict) -> Tuple[Image.Image, Dict]: + """Apply data augmentation""" + + # Random horizontal flip + if random.random() < 0.5: + image, target = self._horizontal_flip(image, target) + + # Random color jitter + if random.random() < 0.5: + image = self._color_jitter(image) + + return image, target + + def _horizontal_flip(self, + image: Image.Image, + target: Dict) -> Tuple[Image.Image, Dict]: + """Apply horizontal flip""" + + w, h = image.size + image = image.transpose(Image.FLIP_LEFT_RIGHT) + + if 'boxes' in target and len(target['boxes']) > 0: + boxes = target['boxes'].clone() if torch.is_tensor(target['boxes']) else target['boxes'].copy() + + # Flip x coordinates + if torch.is_tensor(boxes): + boxes[:, [0, 2]] = w - boxes[:, [2, 0]] + else: + boxes[:, [0, 2]] = w - boxes[:, [2, 0]] + + target['boxes'] = boxes + + return image, target + + def _color_jitter(self, image: Image.Image) -> Image.Image: + """Apply color jittering""" + + jitter = T.ColorJitter( + brightness=0.2, + contrast=0.2, + saturation=0.2, + hue=0.1 + ) + + return jitter(image) + + +def get_transform(img_size: int = 640, + train: bool = False) -> Transforms: + """ + Get appropriate transform for training or inference + + Args: + img_size: Target image size + train: Whether this is for training (enables augmentation) + + Returns: + Transform object + """ + + return Transforms( + img_size=img_size, + augment=train, + normalize=True + ) \ No newline at end of file diff --git a/engine/__init__.py b/deim/_engine/__init__.py similarity index 100% rename from engine/__init__.py rename to deim/_engine/__init__.py diff --git a/deim/_engine/backbone/__init__.py b/deim/_engine/backbone/__init__.py new file mode 100644 index 00000000..e709ff5f --- /dev/null +++ b/deim/_engine/backbone/__init__.py @@ -0,0 +1,20 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +from deim._engine.backbone.common import ( + get_activation, + FrozenBatchNorm2d, + freeze_batch_norm2d, +) +from deim._engine.backbone.presnet import PResNet +from deim._engine.backbone.test_resnet import MResNet + +from deim._engine.backbone.timm_model import TimmModel +from deim._engine.backbone.torchvision_model import TorchVisionModel + +from deim._engine.backbone.csp_resnet import CSPResNet +from deim._engine.backbone.csp_darknet import CSPDarkNet, CSPPAN + +from deim._engine.backbone.hgnetv2 import HGNetv2 diff --git a/engine/backbone/common.py b/deim/_engine/backbone/common.py similarity index 100% rename from engine/backbone/common.py rename to deim/_engine/backbone/common.py diff --git a/deim/_engine/backbone/csp_darknet.py b/deim/_engine/backbone/csp_darknet.py new file mode 100644 index 00000000..c3365dac --- /dev/null +++ b/deim/_engine/backbone/csp_darknet.py @@ -0,0 +1,179 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import math +import warnings + +from deim._engine.backbone.common import get_activation +from deim._engine.core import register + + +def autopad(k, p=None): + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] + return p + +def make_divisible(c, d): + return math.ceil(c / d) * d + + +class Conv(nn.Module): + def __init__(self, cin, cout, k=1, s=1, p=None, g=1, act='silu') -> None: + super().__init__() + self.conv = nn.Conv2d(cin, cout, k, s, autopad(k, p), groups=g, bias=False) + self.bn = nn.BatchNorm2d(cout) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, act='silu'): + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1, act=act) + self.cv2 = Conv(c_, c2, 3, 1, g=g, act=act) + self.add = shortcut and c1 == c2 + + def forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class C3(nn.Module): + # CSP Bottleneck with 3 convolutions + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act='silu'): # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1, act=act) + self.cv2 = Conv(c1, c_, 1, 1, act=act) + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n))) + self.cv3 = Conv(2 * c_, c2, 1, act=act) + + def forward(self, x): + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1)) + + +class SPPF(nn.Module): + # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher + def __init__(self, c1, c2, k=5, act='silu'): # equivalent to SPP(k=(5, 9, 13)) + super().__init__() + c_ = c1 // 2 # hidden channels + self.cv1 = Conv(c1, c_, 1, 1, act=act) + self.cv2 = Conv(c_ * 4, c2, 1, 1, act=act) + self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + + def forward(self, x): + x = self.cv1(x) + with warnings.catch_warnings(): + warnings.simplefilter('ignore') # suppress torch 1.9.0 max_pool2d() warning + y1 = self.m(x) + y2 = self.m(y1) + return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1)) + + +@register() +class CSPDarkNet(nn.Module): + __share__ = ['depth_multi', 'width_multi'] + + def __init__(self, in_channels=3, width_multi=1.0, depth_multi=1.0, return_idx=[2, 3, 
-1], act='silu', ) -> None: + super().__init__() + + channels = [64, 128, 256, 512, 1024] + channels = [make_divisible(c * width_multi, 8) for c in channels] + + depths = [3, 6, 9, 3] + depths = [max(round(d * depth_multi), 1) for d in depths] + + self.layers = nn.ModuleList([Conv(in_channels, channels[0], 6, 2, 2, act=act)]) + for i, (c, d) in enumerate(zip(channels, depths), 1): + layer = nn.Sequential(*[Conv(c, channels[i], 3, 2, act=act), C3(channels[i], channels[i], n=d, act=act)]) + self.layers.append(layer) + + self.layers.append(SPPF(channels[-1], channels[-1], k=5, act=act)) + + self.return_idx = return_idx + self.out_channels = [channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32][i] for i in self.return_idx] + self.depths = depths + self.act = act + + def forward(self, x): + outputs = [] + for _, m in enumerate(self.layers): + x = m(x) + outputs.append(x) + + return [outputs[i] for i in self.return_idx] + + +@register() +class CSPPAN(nn.Module): + """ + P5 ---> 1x1 ---------------------------------> concat --> c3 --> det + | up | conv /2 + P4 ---> concat ---> c3 ---> 1x1 --> concat ---> c3 -----------> det + | up | conv /2 + P3 -----------------------> concat ---> c3 ---------------------> det + """ + __share__ = ['depth_multi', ] + + def __init__(self, in_channels=[256, 512, 1024], depth_multi=1., act='silu') -> None: + super().__init__() + depth = max(round(3 * depth_multi), 1) + + self.out_channels = in_channels + self.fpn_stems = nn.ModuleList([Conv(cin, cout, 1, 1, act=act) for cin, cout in zip(in_channels[::-1], in_channels[::-1][1:])]) + self.fpn_csps = nn.ModuleList([C3(cin, cout, depth, False, act=act) for cin, cout in zip(in_channels[::-1], in_channels[::-1][1:])]) + + self.pan_stems = nn.ModuleList([Conv(c, c, 3, 2, act=act) for c in in_channels[:-1]]) + self.pan_csps = nn.ModuleList([C3(c, c, depth, False, act=act) for c in in_channels[1:]]) + + def forward(self, feats): + fpn_feats = [] + for i, feat in enumerate(feats[::-1]): + if i == 0: + feat = self.fpn_stems[i](feat) + fpn_feats.append(feat) + else: + _feat = F.interpolate(fpn_feats[-1], scale_factor=2, mode='nearest') + feat = torch.concat([_feat, feat], dim=1) + feat = self.fpn_csps[i-1](feat) + if i < len(self.fpn_stems): + feat = self.fpn_stems[i](feat) + fpn_feats.append(feat) + + pan_feats = [] + for i, feat in enumerate(fpn_feats[::-1]): + if i == 0: + pan_feats.append(feat) + else: + _feat = self.pan_stems[i-1](pan_feats[-1]) + feat = torch.concat([_feat, feat], dim=1) + feat = self.pan_csps[i-1](feat) + pan_feats.append(feat) + + return pan_feats + + +if __name__ == '__main__': + + data = torch.rand(1, 3, 320, 640) + + width_multi = 0.75 + depth_multi = 0.33 + + m = CSPDarkNet(3, width_multi=width_multi, depth_multi=depth_multi, act='silu') + outputs = m(data) + print([o.shape for o in outputs]) + + m = CSPPAN(in_channels=m.out_channels, depth_multi=depth_multi, act='silu') + outputs = m(outputs) + print([o.shape for o in outputs]) diff --git a/engine/backbone/csp_resnet.py b/deim/_engine/backbone/csp_resnet.py similarity index 100% rename from engine/backbone/csp_resnet.py rename to deim/_engine/backbone/csp_resnet.py diff --git a/deim/_engine/backbone/hgnetv2.py b/deim/_engine/backbone/hgnetv2.py new file mode 100644 index 00000000..45e83a78 --- /dev/null +++ b/deim/_engine/backbone/hgnetv2.py @@ -0,0 +1,536 @@ +""" +reference +- https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + +Copyright (c) 2024 The D-FINE Authors. 
All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import os +from deim._engine.backbone.common import FrozenBatchNorm2d +from deim._engine.core import register +import logging + +# Constants for initialization +kaiming_normal_ = nn.init.kaiming_normal_ +zeros_ = nn.init.zeros_ +ones_ = nn.init.ones_ + +__all__ = ['HGNetv2'] + + +class LearnableAffineBlock(nn.Module): + def __init__( + self, + scale_value=1.0, + bias_value=0.0 + ): + super().__init__() + self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True) + self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(nn.Module): + def __init__( + self, + in_chs, + out_chs, + kernel_size, + stride=1, + groups=1, + padding='', + use_act=True, + use_lab=False + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + if padding == 'same': + self.conv = nn.Sequential( + nn.ZeroPad2d([0, 1, 0, 1]), + nn.Conv2d( + in_chs, + out_chs, + kernel_size, + stride, + groups=groups, + bias=False + ) + ) + else: + self.conv = nn.Conv2d( + in_chs, + out_chs, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=False + ) + self.bn = nn.BatchNorm2d(out_chs) + if self.use_act: + self.act = nn.ReLU() + else: + self.act = nn.Identity() + if self.use_act and self.use_lab: + self.lab = LearnableAffineBlock() + else: + self.lab = nn.Identity() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + x = self.lab(x) + return x + + +class LightConvBNAct(nn.Module): + def __init__( + self, + in_chs, + out_chs, + kernel_size, + groups=1, + use_lab=False, + ): + super().__init__() + self.conv1 = ConvBNAct( + in_chs, + out_chs, + kernel_size=1, + use_act=False, + use_lab=use_lab, + ) + self.conv2 = ConvBNAct( + out_chs, + out_chs, + kernel_size=kernel_size, + groups=out_chs, + use_act=True, + use_lab=use_lab, + ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(nn.Module): + # for HGNetv2 + def __init__(self, in_chs, mid_chs, out_chs, use_lab=False): + super().__init__() + self.stem1 = ConvBNAct( + in_chs, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem2a = ConvBNAct( + mid_chs, + mid_chs // 2, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem2b = ConvBNAct( + mid_chs // 2, + mid_chs, + kernel_size=2, + stride=1, + use_lab=use_lab, + ) + self.stem3 = ConvBNAct( + mid_chs * 2, + mid_chs, + kernel_size=3, + stride=2, + use_lab=use_lab, + ) + self.stem4 = ConvBNAct( + mid_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True) + + def forward(self, x): + x = self.stem1(x) + x = F.pad(x, (0, 1, 0, 1)) + x2 = self.stem2a(x) + x2 = F.pad(x2, (0, 1, 0, 1)) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = torch.cat([x1, x2], dim=1) + x = self.stem3(x) + x = self.stem4(x) + return x + + +class EseModule(nn.Module): + def __init__(self, chs): + super().__init__() + self.conv = nn.Conv2d( + chs, + chs, + kernel_size=1, + stride=1, + padding=0, + ) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = x.mean((2, 3), keepdim=True) + x = self.conv(x) + x = self.sigmoid(x) + return torch.mul(identity, x) + + +class HG_Block(nn.Module): + def __init__( + self, + in_chs, + mid_chs, + out_chs, + layer_num, + kernel_size=3, + residual=False, + light_block=False, + 
use_lab=False, + agg='ese', + drop_path=0., + ): + super().__init__() + self.residual = residual + + self.layers = nn.ModuleList() + for i in range(layer_num): + if light_block: + self.layers.append( + LightConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + use_lab=use_lab, + ) + ) + else: + self.layers.append( + ConvBNAct( + in_chs if i == 0 else mid_chs, + mid_chs, + kernel_size=kernel_size, + stride=1, + use_lab=use_lab, + ) + ) + + # feature aggregation + total_chs = in_chs + layer_num * mid_chs + if agg == 'se': + aggregation_squeeze_conv = ConvBNAct( + total_chs, + out_chs // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + aggregation_excitation_conv = ConvBNAct( + out_chs // 2, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + self.aggregation = nn.Sequential( + aggregation_squeeze_conv, + aggregation_excitation_conv, + ) + else: + aggregation_conv = ConvBNAct( + total_chs, + out_chs, + kernel_size=1, + stride=1, + use_lab=use_lab, + ) + att = EseModule(out_chs) + self.aggregation = nn.Sequential( + aggregation_conv, + att, + ) + + self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity() + + def forward(self, x): + identity = x + output = [x] + for layer in self.layers: + x = layer(x) + output.append(x) + x = torch.cat(output, dim=1) + x = self.aggregation(x) + if self.residual: + x = self.drop_path(x) + identity + return x + + +class HG_Stage(nn.Module): + def __init__( + self, + in_chs, + mid_chs, + out_chs, + block_num, + layer_num, + downsample=True, + light_block=False, + kernel_size=3, + use_lab=False, + agg='se', + drop_path=0., + ): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_chs, + in_chs, + kernel_size=3, + stride=2, + groups=in_chs, + use_act=False, + use_lab=use_lab, + ) + else: + self.downsample = nn.Identity() + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HG_Block( + in_chs if i == 0 else out_chs, + mid_chs, + out_chs, + layer_num, + residual=False if i == 0 else True, + kernel_size=kernel_size, + light_block=light_block, + use_lab=use_lab, + agg=agg, + drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + x = self.downsample(x) + x = self.blocks(x) + return x + + + +@register() +class HGNetv2(nn.Module): + """ + HGNetV2 + Args: + stem_channels: list. Number of channels for the stem block. + stage_type: str. The stage configuration of HGNet. such as the number of channels, stride, etc. + use_lab: boolean. Whether to use LearnableAffineBlock in network. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific HGNetV2 model depends on args. 
+ """ + + arch_configs = { + 'B0': { + 'stem_channels': [3, 16, 16], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth' + }, + 'B1': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 64, 1, False, False, 3, 3], + "stage2": [64, 48, 256, 1, True, False, 3, 3], + "stage3": [256, 96, 512, 2, True, True, 5, 3], + "stage4": [512, 192, 1024, 1, True, True, 5, 3], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B1_stage1.pth' + }, + 'B2': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth' + }, + 'B3': { + 'stem_channels': [3, 24, 32], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 128, 1, False, False, 3, 5], + "stage2": [128, 64, 512, 1, True, False, 3, 5], + "stage3": [512, 128, 1024, 3, True, True, 5, 5], + "stage4": [1024, 256, 2048, 1, True, True, 5, 5], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B3_stage1.pth' + }, + 'B4': { + 'stem_channels': [3, 32, 48], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth' + }, + 'B5': { + 'stem_channels': [3, 32, 64], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth' + }, + 'B6': { + 'stem_channels': [3, 48, 96], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + }, + 'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B6_stage1.pth' + }, + } + + def __init__(self, + name, + use_lab=False, + return_idx=[1, 2, 3], + freeze_stem_only=True, + freeze_at=0, + freeze_norm=True, + pretrained=True, + 
local_model_dir='weight/hgnetv2/'): + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[name]['stem_channels'] + stage_config = self.arch_configs[name]['stage_config'] + download_url = self.arch_configs[name]['url'] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + # stem + self.stem = StemBlock( + in_chs=stem_channels[0], + mid_chs=stem_channels[1], + out_chs=stem_channels[2], + use_lab=use_lab) + + # stages + self.stages = nn.ModuleList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HG_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab)) + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + if not freeze_stem_only: + for i in range(min(freeze_at + 1, len(self.stages))): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + try: + model_path = os.path.join(local_model_dir, 'PPHGNetV2_' + name + '_stage1.pth') + if os.path.exists(model_path): + state = torch.load(model_path, map_location='cpu') + print(f"Loaded stage1 {name} HGNetV2 from local file.") + else: + # If the file doesn't exist locally, download from the URL + print(GREEN + "If the pretrained HGNetV2 can't be downloaded automatically. Please check your network connection." + RESET) + print(GREEN + "Please check your network connection. Or download the model manually from " + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET) + state = torch.hub.load_state_dict_from_url(download_url, map_location='cpu', model_dir=local_model_dir) + print(f"Loaded stage1 {name} HGNetV2 from URL.") + + self.load_state_dict(state) + + except (Exception, KeyboardInterrupt) as e: + print(f"{str(e)}") + logging.error(RED + "CRITICAL WARNING: Failed to load pretrained HGNetV2 model" + RESET) + logging.error(GREEN + "Please check your network connection. Or download the model manually from " \ + + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET) + exit() + + + + + def _freeze_norm(self, m: nn.Module): + if isinstance(m, nn.BatchNorm2d): + m = FrozenBatchNorm2d(m.num_features) + else: + for name, child in m.named_children(): + _child = self._freeze_norm(child) + if _child is not child: + setattr(m, name, _child) + return m + + def _freeze_parameters(self, m: nn.Module): + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs diff --git a/deim/_engine/backbone/presnet.py b/deim/_engine/backbone/presnet.py new file mode 100644 index 00000000..10af8d28 --- /dev/null +++ b/deim/_engine/backbone/presnet.py @@ -0,0 +1,259 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from collections import OrderedDict + +from deim._engine.backbone.common import get_activation, FrozenBatchNorm2d + +from deim._engine.core import register +import os + + +__all__ = ['PResNet'] + + +ResNet_cfg = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + # 152: [3, 8, 36, 3], +} + + +donwload_url = { + 18: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet18_vd_pretrained_from_paddle.pth', + 34: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet34_vd_pretrained_from_paddle.pth', + 50: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet50_vd_ssld_v2_pretrained_from_paddle.pth', + 101: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet101_vd_ssld_pretrained_from_paddle.pth', +} + +local_weights = { + 18: "ResNet18_vd_pretrained_from_paddle.pth", + 34: "ResNet34_vd_pretrained_from_paddle.pth", + 50: "ResNet50_vd_ssld_v2_pretrained_from_paddle.pth", + 101: "ResNet101_vd_ssld_pretrained_from_paddle.pth" +} + +class ConvNormLayer(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): + super().__init__() + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + padding=(kernel_size-1)//2 if padding is None else padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = get_activation(act) + + def forward(self, x): + return self.act(self.norm(self.conv(x))) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'): + super().__init__() + + self.shortcut = shortcut + + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential(OrderedDict([ + ('pool', nn.AvgPool2d(2, 2, 0, ceil_mode=True)), + ('conv', ConvNormLayer(ch_in, ch_out, 1, 1)) + ])) + else: + self.short = ConvNormLayer(ch_in, ch_out, 1, stride) + + self.branch2a = ConvNormLayer(ch_in, ch_out, 3, stride, act=act) + self.branch2b = ConvNormLayer(ch_out, ch_out, 3, 1, act=None) + self.act = nn.Identity() if act is None else get_activation(act) + + + def forward(self, x): + out = self.branch2a(x) + out = self.branch2b(out) + if self.shortcut: + short = x + else: + short = self.short(x) + + out = out + short + out = self.act(out) + + return out + + +class BottleNeck(nn.Module): + expansion = 4 + + def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'): + super().__init__() + + if variant == 'a': + stride1, stride2 = stride, 1 + else: + stride1, stride2 = 1, stride + + width = ch_out + + self.branch2a = ConvNormLayer(ch_in, width, 1, stride1, act=act) + self.branch2b = ConvNormLayer(width, width, 3, stride2, act=act) + self.branch2c = ConvNormLayer(width, ch_out * self.expansion, 1, 1) + + self.shortcut = shortcut + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential(OrderedDict([ + ('pool', nn.AvgPool2d(2, 2, 0, ceil_mode=True)), + ('conv', ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1)) + ])) + else: + self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride) + + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + out = self.branch2a(x) + out = self.branch2b(out) + out = self.branch2c(out) + + if self.shortcut: + short = x + else: + short = self.short(x) + + out = out + short + out = self.act(out) + + return out + + +class Blocks(nn.Module): + def __init__(self, block, ch_in, 
ch_out, count, stage_num, act='relu', variant='b'):
+        super().__init__()
+
+        self.blocks = nn.ModuleList()
+        for i in range(count):
+            self.blocks.append(
+                block(
+                    ch_in,
+                    ch_out,
+                    stride=2 if i == 0 and stage_num != 2 else 1,
+                    shortcut=False if i == 0 else True,
+                    variant=variant,
+                    act=act)
+            )
+
+            if i == 0:
+                ch_in = ch_out * block.expansion
+
+    def forward(self, x):
+        out = x
+        for block in self.blocks:
+            out = block(out)
+        return out
+
+
+@register()
+class PResNet(nn.Module):
+    def __init__(
+        self,
+        depth,
+        variant='d',
+        num_stages=4,
+        return_idx=[0, 1, 2, 3],
+        act='relu',
+        freeze_at=-1,
+        freeze_norm=True,
+        pretrained=False,
+        local_model_dir='weights/resnets',
+    ):
+        super().__init__()
+
+        block_nums = ResNet_cfg[depth]
+        ch_in = 64
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, "conv1_1"]]
+
+        self.conv1 = nn.Sequential(OrderedDict([
+            (name, ConvNormLayer(cin, cout, k, s, act=act)) for cin, cout, k, s, name in conv_def
+        ]))
+
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+
+        _out_channels = [block.expansion * v for v in ch_out_list]
+        _out_strides = [4, 8, 16, 32]
+
+        self.res_layers = nn.ModuleList()
+        for i in range(num_stages):
+            stage_num = i + 2
+            self.res_layers.append(
+                Blocks(block, ch_in, ch_out_list[i], block_nums[i], stage_num, act=act, variant=variant)
+            )
+            ch_in = _out_channels[i]
+
+        self.return_idx = return_idx
+        self.out_channels = [_out_channels[_i] for _i in return_idx]
+        self.out_strides = [_out_strides[_i] for _i in return_idx]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            for i in range(min(freeze_at, num_stages)):
+                self._freeze_parameters(self.res_layers[i])
+
+        if freeze_norm:
+            self._freeze_norm(self)
+
+        if pretrained:
+            # Prefer a local checkpoint; otherwise fall back to downloading the pretrained weights.
+            model_path = os.path.join(local_model_dir, local_weights[depth])
+            if os.path.exists(model_path):
+                state = torch.load(model_path, map_location='cpu')
+                print(f"Loaded PResNet{depth} from local file@{model_path}.")
+            else:
+                if isinstance(pretrained, bool) or 'http' in pretrained:
+                    state = torch.hub.load_state_dict_from_url(donwload_url[depth], map_location='cpu', model_dir=local_model_dir)
+                else:
+                    state = torch.load(pretrained, map_location='cpu')
+            self.load_state_dict(state)
+            print(f'Load PResNet{depth} state_dict')
+
+    def _freeze_parameters(self, m: nn.Module):
+        for p in m.parameters():
+            p.requires_grad = False
+
+    def _freeze_norm(self, m: nn.Module):
+        if isinstance(m, nn.BatchNorm2d):
+            m = FrozenBatchNorm2d(m.num_features)
+        else:
+            for name, child in m.named_children():
+                _child = self._freeze_norm(child)
+                if _child is not child:
+                    setattr(m, name, _child)
+        return m
+
+    def forward(self, x):
+        conv1 = self.conv1(x)
+        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
diff --git a/deim/_engine/backbone/test_resnet.py b/deim/_engine/backbone/test_resnet.py
new file mode 100644
index 00000000..58b8ebe1
--- /dev/null
+++ b/deim/_engine/backbone/test_resnet.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from collections import OrderedDict
+
+
+from deim._engine.core import register
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicBlock, self).__init__()
+
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes,kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + + +class _ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super().__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + + self.linear = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +@register() +class MResNet(nn.Module): + def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None: + super().__init__() + self.model = _ResNet(BasicBlock, num_blocks, num_classes) + + def forward(self, x): + return self.model(x) diff --git a/deim/_engine/backbone/timm_model.py b/deim/_engine/backbone/timm_model.py new file mode 100644 index 00000000..926b884b --- /dev/null +++ b/deim/_engine/backbone/timm_model.py @@ -0,0 +1,69 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+ +https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583 +""" +import torch +from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor + +from deim._engine.backbone.utils import IntermediateLayerGetter +from deim._engine.core import register + + +@register() +class TimmModel(torch.nn.Module): + def __init__(self, \ + name, + return_layers, + pretrained=False, + exportable=True, + features_only=True, + **kwargs) -> None: + + super().__init__() + + import timm + model = timm.create_model( + name, + pretrained=pretrained, + exportable=exportable, + features_only=features_only, + **kwargs + ) + # nodes, _ = get_graph_node_names(model) + # print(nodes) + # features = {'': ''} + # model = create_feature_extractor(model, return_nodes=features) + + assert set(return_layers).issubset(model.feature_info.module_name()), \ + f'return_layers should be a subset of {model.feature_info.module_name()}' + + # self.model = model + self.model = IntermediateLayerGetter(model, return_layers) + + return_idx = [model.feature_info.module_name().index(name) for name in return_layers] + self.strides = [model.feature_info.reduction()[i] for i in return_idx] + self.channels = [model.feature_info.channels()[i] for i in return_idx] + self.return_idx = return_idx + self.return_layers = return_layers + + def forward(self, x: torch.Tensor): + outputs = self.model(x) + # outputs = [outputs[i] for i in self.return_idx] + return outputs + + +if __name__ == '__main__': + + model = TimmModel(name='resnet34', return_layers=['layer2', 'layer3']) + data = torch.rand(1, 3, 640, 640) + outputs = model(data) + + for output in outputs: + print(output.shape) + + """ + model: + type: TimmModel + name: resnet34 + return_layers: ['layer2', 'layer4'] + """ diff --git a/deim/_engine/backbone/torchvision_model.py b/deim/_engine/backbone/torchvision_model.py new file mode 100644 index 00000000..ba786570 --- /dev/null +++ b/deim/_engine/backbone/torchvision_model.py @@ -0,0 +1,49 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torchvision + +from deim._engine.core import register +from deim._engine.backbone.utils import IntermediateLayerGetter + +__all__ = ['TorchVisionModel'] + +@register() +class TorchVisionModel(torch.nn.Module): + def __init__(self, name, return_layers, weights=None, **kwargs) -> None: + super().__init__() + + if weights is not None: + weights = getattr(torchvision.models.get_model_weights(name), weights) + + model = torchvision.models.get_model(name, weights=weights, **kwargs) + + # TODO hard code. 
+ if hasattr(model, 'features'): + model = IntermediateLayerGetter(model.features, return_layers) + else: + model = IntermediateLayerGetter(model, return_layers) + + self.model = model + + def forward(self, x): + return self.model(x) + + +# TorchVisionModel('swin_t', return_layers=['5', '7']) +# TorchVisionModel('resnet34', return_layers=['layer2','layer3', 'layer4']) + +# TorchVisionModel: +# name: swin_t +# return_layers: ['5', '7'] +# weights: DEFAULT + + +# model: +# type: TorchVisionModel +# name: resnet34 +# return_layers: ['layer2','layer3', 'layer4'] +# weights: DEFAULT diff --git a/engine/backbone/utils.py b/deim/_engine/backbone/utils.py similarity index 100% rename from engine/backbone/utils.py rename to deim/_engine/backbone/utils.py diff --git a/deim/_engine/base_model.pth b/deim/_engine/base_model.pth new file mode 100644 index 00000000..06d56b13 Binary files /dev/null and b/deim/_engine/base_model.pth differ diff --git a/configs/base/dataloader.yml b/deim/_engine/configs/base/dataloader.yml similarity index 100% rename from configs/base/dataloader.yml rename to deim/_engine/configs/base/dataloader.yml diff --git a/configs/base/deim.yml b/deim/_engine/configs/base/deim.yml similarity index 100% rename from configs/base/deim.yml rename to deim/_engine/configs/base/deim.yml diff --git a/configs/base/dfine_hgnetv2.yml b/deim/_engine/configs/base/dfine_hgnetv2.yml similarity index 100% rename from configs/base/dfine_hgnetv2.yml rename to deim/_engine/configs/base/dfine_hgnetv2.yml diff --git a/configs/base/optimizer.yml b/deim/_engine/configs/base/optimizer.yml similarity index 100% rename from configs/base/optimizer.yml rename to deim/_engine/configs/base/optimizer.yml diff --git a/configs/base/rt_deim.yml b/deim/_engine/configs/base/rt_deim.yml similarity index 100% rename from configs/base/rt_deim.yml rename to deim/_engine/configs/base/rt_deim.yml diff --git a/configs/base/rt_optimizer.yml b/deim/_engine/configs/base/rt_optimizer.yml similarity index 100% rename from configs/base/rt_optimizer.yml rename to deim/_engine/configs/base/rt_optimizer.yml diff --git a/configs/base/rtdetrv2_r50vd.yml b/deim/_engine/configs/base/rtdetrv2_r50vd.yml similarity index 100% rename from configs/base/rtdetrv2_r50vd.yml rename to deim/_engine/configs/base/rtdetrv2_r50vd.yml diff --git a/configs/dataset/coco_detection.yml b/deim/_engine/configs/dataset/coco_detection.yml similarity index 100% rename from configs/dataset/coco_detection.yml rename to deim/_engine/configs/dataset/coco_detection.yml diff --git a/configs/dataset/crowdhuman_detection.yml b/deim/_engine/configs/dataset/crowdhuman_detection.yml similarity index 100% rename from configs/dataset/crowdhuman_detection.yml rename to deim/_engine/configs/dataset/crowdhuman_detection.yml diff --git a/configs/dataset/custom_detection.yml b/deim/_engine/configs/dataset/custom_detection.yml similarity index 100% rename from configs/dataset/custom_detection.yml rename to deim/_engine/configs/dataset/custom_detection.yml diff --git a/configs/dataset/obj365_detection.yml b/deim/_engine/configs/dataset/obj365_detection.yml similarity index 100% rename from configs/dataset/obj365_detection.yml rename to deim/_engine/configs/dataset/obj365_detection.yml diff --git a/configs/dataset/voc_detection.yml b/deim/_engine/configs/dataset/voc_detection.yml similarity index 100% rename from configs/dataset/voc_detection.yml rename to deim/_engine/configs/dataset/voc_detection.yml diff --git a/configs/deim_dfine/deim_hgnetv2_l_coco.yml 
b/deim/_engine/configs/deim_dfine/deim_hgnetv2_l_coco.yml similarity index 100% rename from configs/deim_dfine/deim_hgnetv2_l_coco.yml rename to deim/_engine/configs/deim_dfine/deim_hgnetv2_l_coco.yml diff --git a/configs/deim_dfine/deim_hgnetv2_m_coco.yml b/deim/_engine/configs/deim_dfine/deim_hgnetv2_m_coco.yml similarity index 100% rename from configs/deim_dfine/deim_hgnetv2_m_coco.yml rename to deim/_engine/configs/deim_dfine/deim_hgnetv2_m_coco.yml diff --git a/configs/deim_dfine/deim_hgnetv2_n_coco.yml b/deim/_engine/configs/deim_dfine/deim_hgnetv2_n_coco.yml similarity index 100% rename from configs/deim_dfine/deim_hgnetv2_n_coco.yml rename to deim/_engine/configs/deim_dfine/deim_hgnetv2_n_coco.yml diff --git a/configs/deim_dfine/deim_hgnetv2_s_coco.yml b/deim/_engine/configs/deim_dfine/deim_hgnetv2_s_coco.yml similarity index 100% rename from configs/deim_dfine/deim_hgnetv2_s_coco.yml rename to deim/_engine/configs/deim_dfine/deim_hgnetv2_s_coco.yml diff --git a/configs/deim_dfine/deim_hgnetv2_x_coco.yml b/deim/_engine/configs/deim_dfine/deim_hgnetv2_x_coco.yml similarity index 100% rename from configs/deim_dfine/deim_hgnetv2_x_coco.yml rename to deim/_engine/configs/deim_dfine/deim_hgnetv2_x_coco.yml diff --git a/configs/deim_dfine/dfine_hgnetv2_l_coco.yml b/deim/_engine/configs/deim_dfine/dfine_hgnetv2_l_coco.yml similarity index 100% rename from configs/deim_dfine/dfine_hgnetv2_l_coco.yml rename to deim/_engine/configs/deim_dfine/dfine_hgnetv2_l_coco.yml diff --git a/configs/deim_dfine/dfine_hgnetv2_m_coco.yml b/deim/_engine/configs/deim_dfine/dfine_hgnetv2_m_coco.yml similarity index 100% rename from configs/deim_dfine/dfine_hgnetv2_m_coco.yml rename to deim/_engine/configs/deim_dfine/dfine_hgnetv2_m_coco.yml diff --git a/configs/deim_dfine/dfine_hgnetv2_n_coco.yml b/deim/_engine/configs/deim_dfine/dfine_hgnetv2_n_coco.yml similarity index 100% rename from configs/deim_dfine/dfine_hgnetv2_n_coco.yml rename to deim/_engine/configs/deim_dfine/dfine_hgnetv2_n_coco.yml diff --git a/configs/deim_dfine/dfine_hgnetv2_s_coco.yml b/deim/_engine/configs/deim_dfine/dfine_hgnetv2_s_coco.yml similarity index 100% rename from configs/deim_dfine/dfine_hgnetv2_s_coco.yml rename to deim/_engine/configs/deim_dfine/dfine_hgnetv2_s_coco.yml diff --git a/configs/deim_dfine/dfine_hgnetv2_x_coco.yml b/deim/_engine/configs/deim_dfine/dfine_hgnetv2_x_coco.yml similarity index 100% rename from configs/deim_dfine/dfine_hgnetv2_x_coco.yml rename to deim/_engine/configs/deim_dfine/dfine_hgnetv2_x_coco.yml diff --git a/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml b/deim/_engine/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml similarity index 100% rename from configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml rename to deim/_engine/configs/deim_dfine/object365/deim_hgnetv2_x_obj2coco_24e.yml diff --git a/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml b/deim/_engine/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml similarity index 100% rename from configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml rename to deim/_engine/configs/deim_dfine/object365/dfine_hgnetv2_x_obj2coco.yml diff --git a/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/deim_r101vd_60e_coco.yml diff --git a/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml 
b/deim/_engine/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/deim_r18vd_120e_coco.yml diff --git a/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/deim_r34vd_120e_coco.yml diff --git a/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/deim_r50vd_60e_coco.yml diff --git a/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/deim_r50vd_m_60e_coco.yml diff --git a/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml b/deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r101vd_6x_coco.yml diff --git a/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r18vd_120e_coco.yml diff --git a/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml b/deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r34vd_120e_coco.yml diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml b/deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r50vd_6x_coco.yml diff --git a/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml b/deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml similarity index 100% rename from configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml rename to deim/_engine/configs/deim_rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml diff --git a/configs/runtime.yml b/deim/_engine/configs/runtime.yml similarity index 100% rename from configs/runtime.yml rename to deim/_engine/configs/runtime.yml diff --git a/deim/_engine/core/__init__.py b/deim/_engine/core/__init__.py new file mode 100644 index 00000000..8a710c0c --- /dev/null +++ b/deim/_engine/core/__init__.py @@ -0,0 +1,9 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +from deim._engine.core.workspace import GLOBAL_CONFIG, register, create +from deim._engine.core.yaml_utils import * +from deim._engine.core._config import BaseConfig +from deim._engine.core.yaml_config import YAMLConfig diff --git a/deim/_engine/core/_config.py b/deim/_engine/core/_config.py new file mode 100644 index 00000000..82555f86 --- /dev/null +++ b/deim/_engine/core/_config.py @@ -0,0 +1,299 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler +from torch.cuda.amp.grad_scaler import GradScaler +from torch.utils.tensorboard import SummaryWriter + +from pathlib import Path +from typing import Callable, List, Dict + + +__all__ = ['BaseConfig', ] + + +class BaseConfig(object): + # TODO property + + def __init__(self) -> None: + super().__init__() + + self.task :str = None + + # instance / function + self._model :nn.Module = None + self._postprocessor :nn.Module = None + self._criterion :nn.Module = None + self._optimizer :Optimizer = None + self._lr_scheduler :LRScheduler = None + self._lr_warmup_scheduler: LRScheduler = None + self._train_dataloader :DataLoader = None + self._val_dataloader :DataLoader = None + self._ema :nn.Module = None + self._scaler :GradScaler = None + self._train_dataset :Dataset = None + self._val_dataset :Dataset = None + self._collate_fn :Callable = None + self._evaluator :Callable[[nn.Module, DataLoader, str], ] = None + self._writer: SummaryWriter = None + + # dataset + self.num_workers :int = 0 + self.batch_size :int = None + self._train_batch_size :int = None + self._val_batch_size :int = None + self._train_shuffle: bool = None + self._val_shuffle: bool = None + + # runtime + self.resume :str = None + self.tuning :str = None + + self.epoches :int = None + self.last_epoch :int = -1 + + # new_add for support self-defined cosine FIXME + self.lrsheduler: str = None + self.lr_gamma: float = None + self.no_aug_epoch: int = None + self.warmup_iter: int = None + self.flat_epoch: int = None + + self.use_amp :bool = False + self.use_ema :bool = False + self.ema_decay :float = 0.9999 + self.ema_warmups: int = 2000 + self.sync_bn :bool = False + self.clip_max_norm : float = 0. 
+ self.find_unused_parameters :bool = None + + self.seed :int = None + self.print_freq :int = None + self.checkpoint_freq :int = 1 + self.output_dir :str = None + self.summary_dir :str = None + self.device : str = '' + + @property + def model(self, ) -> nn.Module: + return self._model + + @model.setter + def model(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._model = m + + @property + def postprocessor(self, ) -> nn.Module: + return self._postprocessor + + @postprocessor.setter + def postprocessor(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._postprocessor = m + + @property + def criterion(self, ) -> nn.Module: + return self._criterion + + @criterion.setter + def criterion(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._criterion = m + + @property + def optimizer(self, ) -> Optimizer: + return self._optimizer + + @optimizer.setter + def optimizer(self, m): + assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please check your model class' + self._optimizer = m + + @property + def lr_scheduler(self, ) -> LRScheduler: + return self._lr_scheduler + + @lr_scheduler.setter + def lr_scheduler(self, m): + assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class' + self._lr_scheduler = m + + @property + def lr_warmup_scheduler(self, ) -> LRScheduler: + return self._lr_warmup_scheduler + + @lr_warmup_scheduler.setter + def lr_warmup_scheduler(self, m): + self._lr_warmup_scheduler = m + + @property + def train_dataloader(self) -> DataLoader: + if self._train_dataloader is None and self.train_dataset is not None: + loader = DataLoader(self.train_dataset, + batch_size=self.train_batch_size, + num_workers=self.num_workers, + collate_fn=self.collate_fn, + shuffle=self.train_shuffle, ) + loader.shuffle = self.train_shuffle + self._train_dataloader = loader + + return self._train_dataloader + + @train_dataloader.setter + def train_dataloader(self, loader): + self._train_dataloader = loader + + @property + def val_dataloader(self) -> DataLoader: + if self._val_dataloader is None and self.val_dataset is not None: + loader = DataLoader(self.val_dataset, + batch_size=self.val_batch_size, + num_workers=self.num_workers, + drop_last=False, + collate_fn=self.collate_fn, + shuffle=self.val_shuffle, + persistent_workers=True) + loader.shuffle = self.val_shuffle + self._val_dataloader = loader + + return self._val_dataloader + + @val_dataloader.setter + def val_dataloader(self, loader): + self._val_dataloader = loader + + @property + def ema(self, ) -> nn.Module: + if self._ema is None and self.use_ema and self.model is not None: + from deim._engine.optim import ModelEMA + self._ema = ModelEMA(self.model, self.ema_decay, self.ema_warmups) + return self._ema + + @ema.setter + def ema(self, obj): + self._ema = obj + + @property + def scaler(self) -> GradScaler: + if self._scaler is None and self.use_amp and torch.cuda.is_available(): + self._scaler = GradScaler() + return self._scaler + + @scaler.setter + def scaler(self, obj: GradScaler): + self._scaler = obj + + @property + def val_shuffle(self) -> bool: + if self._val_shuffle is None: + print('warning: set default val_shuffle=False') + return False + return self._val_shuffle + + @val_shuffle.setter + def val_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._val_shuffle = shuffle + + 
@property + def train_shuffle(self) -> bool: + if self._train_shuffle is None: + print('warning: set default train_shuffle=True') + return True + return self._train_shuffle + + @train_shuffle.setter + def train_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._train_shuffle = shuffle + + + @property + def train_batch_size(self) -> int: + if self._train_batch_size is None and isinstance(self.batch_size, int): + print(f'warning: set train_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._train_batch_size + + @train_batch_size.setter + def train_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._train_batch_size = batch_size + + @property + def val_batch_size(self) -> int: + if self._val_batch_size is None: + print(f'warning: set val_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._val_batch_size + + @val_batch_size.setter + def val_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._val_batch_size = batch_size + + + @property + def train_dataset(self) -> Dataset: + return self._train_dataset + + @train_dataset.setter + def train_dataset(self, dataset): + assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset' + self._train_dataset = dataset + + + @property + def val_dataset(self) -> Dataset: + return self._val_dataset + + @val_dataset.setter + def val_dataset(self, dataset): + assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset' + self._val_dataset = dataset + + @property + def collate_fn(self) -> Callable: + return self._collate_fn + + @collate_fn.setter + def collate_fn(self, fn): + assert isinstance(fn, Callable), f'{type(fn)} must be Callable' + self._collate_fn = fn + + @property + def evaluator(self) -> Callable: + return self._evaluator + + @evaluator.setter + def evaluator(self, fn): + assert isinstance(fn, Callable), f'{type(fn)} must be Callable' + self._evaluator = fn + + @property + def writer(self) -> SummaryWriter: + if self._writer is None: + if self.summary_dir: + self._writer = SummaryWriter(self.summary_dir) + elif self.output_dir: + self._writer = SummaryWriter(Path(self.output_dir) / 'summary') + return self._writer + + @writer.setter + def writer(self, m): + assert isinstance(m, SummaryWriter), f'{type(m)} must be SummaryWriter' + self._writer = m + + def __repr__(self, ): + s = '' + for k, v in self.__dict__.items(): + if not k.startswith('_'): + s += f'{k}: {v}\n' + return s diff --git a/engine/core/workspace.py b/deim/_engine/core/workspace.py similarity index 100% rename from engine/core/workspace.py rename to deim/_engine/core/workspace.py diff --git a/deim/_engine/core/yaml_config.py b/deim/_engine/core/yaml_config.py new file mode 100644 index 00000000..2f063631 --- /dev/null +++ b/deim/_engine/core/yaml_config.py @@ -0,0 +1,176 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader +from datetime import datetime as dt +import re +import copy +from pathlib import Path + +from deim._engine.core._config import BaseConfig +from deim._engine.core.workspace import create +from deim._engine.core.yaml_utils import load_config, merge_config, merge_dict + +class YAMLConfig(BaseConfig): + def __init__(self, cfg_path: str, **kwargs) -> None: + super().__init__() + + cfg = load_config(cfg_path) + cfg = merge_dict(cfg, kwargs) + ts = dt.now().strftime("%Y%m%d_%H%M%S") + cfg['output_dir'] = f"{cfg['output_dir']}/{ts}" + self.yaml_cfg = copy.deepcopy(cfg) + + for k in super().__dict__: + if not k.startswith('_') and k in cfg: + self.__dict__[k] = cfg[k] + + @property + def global_cfg(self, ): + return merge_config(self.yaml_cfg, inplace=False, overwrite=False) + + @property + def model(self, ) -> torch.nn.Module: + if self._model is None and 'model' in self.yaml_cfg: + self._model = create(self.yaml_cfg['model'], self.global_cfg) + return super().model + + @property + def postprocessor(self, ) -> torch.nn.Module: + if self._postprocessor is None and 'postprocessor' in self.yaml_cfg: + self._postprocessor = create(self.yaml_cfg['postprocessor'], self.global_cfg) + return super().postprocessor + + @property + def criterion(self, ) -> torch.nn.Module: + if self._criterion is None and 'criterion' in self.yaml_cfg: + self._criterion = create(self.yaml_cfg['criterion'], self.global_cfg) + return super().criterion + + @property + def optimizer(self, ) -> optim.Optimizer: + if self._optimizer is None and 'optimizer' in self.yaml_cfg: + params = self.get_optim_params(self.yaml_cfg['optimizer'], self.model) + self._optimizer = create('optimizer', self.global_cfg, params=params) + return super().optimizer + + @property + def lr_scheduler(self, ) -> optim.lr_scheduler.LRScheduler: + if self._lr_scheduler is None and 'lr_scheduler' in self.yaml_cfg: + self._lr_scheduler = create('lr_scheduler', self.global_cfg, optimizer=self.optimizer) + print(f'Initial lr: {self._lr_scheduler.get_last_lr()}') + return super().lr_scheduler + + @property + def lr_warmup_scheduler(self, ) -> optim.lr_scheduler.LRScheduler: + if self._lr_warmup_scheduler is None and 'lr_warmup_scheduler' in self.yaml_cfg : + self._lr_warmup_scheduler = create('lr_warmup_scheduler', self.global_cfg, lr_scheduler=self.lr_scheduler) + return super().lr_warmup_scheduler + + @property + def train_dataloader(self, ) -> DataLoader: + if self._train_dataloader is None and 'train_dataloader' in self.yaml_cfg: + self._train_dataloader = self.build_dataloader('train_dataloader') + return super().train_dataloader + + @property + def val_dataloader(self, ) -> DataLoader: + if self._val_dataloader is None and 'val_dataloader' in self.yaml_cfg: + self._val_dataloader = self.build_dataloader('val_dataloader') + return super().val_dataloader + + @property + def ema(self, ) -> torch.nn.Module: + if self._ema is None and self.yaml_cfg.get('use_ema', False): + self._ema = create('ema', self.global_cfg, model=self.model) + return super().ema + + @property + def scaler(self, ): + if self._scaler is None and self.yaml_cfg.get('use_amp', False): + self._scaler = create('scaler', self.global_cfg) + return super().scaler + + @property + def evaluator(self, ): + if self._evaluator is None and 'evaluator' in self.yaml_cfg: + if self.yaml_cfg['evaluator']['type'] == 'CocoEvaluator': + from deim._engine.data import get_coco_api_from_dataset + 
base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset) + self._evaluator = create('evaluator', self.global_cfg, coco_gt=base_ds) + else: + raise NotImplementedError(f"{self.yaml_cfg['evaluator']['type']}") + return super().evaluator + + @staticmethod + def get_optim_params(cfg: dict, model: nn.Module): + """ + E.g.: + ^(?=.*a)(?=.*b).*$ means including a and b + ^(?=.*(?:a|b)).*$ means including a or b + ^(?=.*a)(?!.*b).*$ means including a, but not b + """ + assert 'type' in cfg, '' + cfg = copy.deepcopy(cfg) + + if 'params' not in cfg: + return model.parameters() + + assert isinstance(cfg['params'], list), '' + + param_groups = [] + visited = [] + for pg in cfg['params']: + pattern = pg['params'] + params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} + pg['params'] = params.values() + param_groups.append(pg) + visited.extend(list(params.keys())) + # print(params.keys()) + + names = [k for k, v in model.named_parameters() if v.requires_grad] + + if len(visited) < len(names): + unseen = set(names) - set(visited) + params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} + param_groups.append({'params': params.values()}) + visited.extend(list(params.keys())) + # print(params.keys()) + + assert len(visited) == len(names), '' + + return param_groups + + @staticmethod + def get_rank_batch_size(cfg): + """compute batch size for per rank if total_batch_size is provided. + """ + assert ('total_batch_size' in cfg or 'batch_size' in cfg) \ + and not ('total_batch_size' in cfg and 'batch_size' in cfg), \ + '`batch_size` or `total_batch_size` should be choosed one' + + total_batch_size = cfg.get('total_batch_size', None) + if total_batch_size is None: + bs = cfg.get('batch_size') + else: + from deim._engine.misc import dist_utils + assert total_batch_size % dist_utils.get_world_size() == 0, \ + 'total_batch_size should be divisible by world size' + bs = total_batch_size // dist_utils.get_world_size() + return bs + + def build_dataloader(self, name: str): + bs = self.get_rank_batch_size(self.yaml_cfg[name]) + global_cfg = self.global_cfg + if 'total_batch_size' in global_cfg[name]: + # pop unexpected key for dataloader init + _ = global_cfg[name].pop('total_batch_size') + print(f'building {name} with batch_size={bs}...') + loader = create(name, global_cfg, batch_size=bs) + loader.shuffle = self.yaml_cfg[name].get('shuffle', False) + return loader diff --git a/deim/_engine/core/yaml_utils.py b/deim/_engine/core/yaml_utils.py new file mode 100644 index 00000000..0c768ed7 --- /dev/null +++ b/deim/_engine/core/yaml_utils.py @@ -0,0 +1,126 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import os +import copy +import yaml +from typing import Any, Dict, Optional, List + +from deim._engine.core.workspace import GLOBAL_CONFIG + +__all__ = [ + 'load_config', + 'merge_config', + 'merge_dict', + 'parse_cli', +] + + +INCLUDE_KEY = '__include__' + + +def load_config(file_path, cfg=dict()): + """load config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files" + + with open(file_path) as f: + file_cfg = yaml.load(f, Loader=yaml.Loader) + if file_cfg is None: + return {} + + if INCLUDE_KEY in file_cfg: + base_yamls = list(file_cfg[INCLUDE_KEY]) + for base_yaml in base_yamls: + if base_yaml.startswith('~'): + base_yaml = os.path.expanduser(base_yaml) + + if not base_yaml.startswith('/'): + base_yaml = os.path.join(os.path.dirname(file_path), base_yaml) + + with open(base_yaml) as f: + base_cfg = load_config(base_yaml, cfg) + merge_dict(cfg, base_cfg) + + return merge_dict(cfg, file_cfg) + + +def merge_dict(dct, another_dct, inplace=True) -> Dict: + """merge another_dct into dct + """ + def _merge(dct, another) -> Dict: + for k in another: + if (k in dct and isinstance(dct[k], dict) and isinstance(another[k], dict)): + _merge(dct[k], another[k]) + else: + dct[k] = another[k] + + return dct + + if not inplace: + dct = copy.deepcopy(dct) + + return _merge(dct, another_dct) + + +def dictify(s: str, v: Any) -> Dict: + if '.' not in s: + return {s: v} + key, rest = s.split('.', 1) + return {key: dictify(rest, v)} + + +def parse_cli(nargs: List[str]) -> Dict: + """ + parse command-line arguments + convert `a.c=3 b=10` to `{'a': {'c': 3}, 'b': 10}` + """ + cfg = {} + if nargs is None or len(nargs) == 0: + return cfg + + for s in nargs: + s = s.strip() + k, v = s.split('=', 1) + d = dictify(k, yaml.load(v, Loader=yaml.Loader)) + cfg = merge_dict(cfg, d) + + return cfg + + + +def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: bool=False): + """ + Merge another_cfg into cfg, return the merged config + + Example: + + cfg1 = load_config('./dfine_r18vd_6x_coco.yml') + cfg1 = merge_config(cfg, inplace=True) + + cfg2 = load_config('./dfine_r50vd_6x_coco.yml') + cfg2 = merge_config(cfg2, inplace=True) + + model1 = create(cfg1['model'], cfg1) + model2 = create(cfg2['model'], cfg2) + """ + def _merge(dct, another): + for k in another: + if k not in dct: + dct[k] = another[k] + + elif isinstance(dct[k], dict) and isinstance(another[k], dict): + _merge(dct[k], another[k]) + + elif overwrite: + dct[k] = another[k] + + return cfg + + if not inplace: + cfg = copy.deepcopy(cfg) + + return _merge(cfg, another_cfg) diff --git a/deim/_engine/data/__init__.py b/deim/_engine/data/__init__.py new file mode 100644 index 00000000..21b13a31 --- /dev/null +++ b/deim/_engine/data/__init__.py @@ -0,0 +1,23 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +from deim._engine.data.dataset import * +from deim._engine.data.transforms import * +from deim._engine.data.dataloader import * + +from deim._engine.data._misc import convert_to_tv_tensor + + + + +# def set_epoch(self, epoch) -> None: +# self.epoch = epoch +# def _set_epoch_func(datasets): +# """Add `set_epoch` for datasets +# """ +# from ..core import register +# for ds in datasets: +# register(ds)(set_epoch) +# _set_epoch_func([CIFAR10, VOCDetection, CocoDetection]) diff --git a/engine/data/_misc.py b/deim/_engine/data/_misc.py similarity index 100% rename from engine/data/_misc.py rename to deim/_engine/data/_misc.py diff --git a/deim/_engine/data/dataloader.py b/deim/_engine/data/dataloader.py new file mode 100644 index 00000000..3e21ce2c --- /dev/null +++ b/deim/_engine/data/dataloader.py @@ -0,0 +1,199 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 D-FINE authors. All Rights Reserved. +""" + +import torch +import torch.utils.data as data +import torch.nn.functional as F +from torch.utils.data import default_collate + +import torchvision +import torchvision.transforms.v2 as VT +from torchvision.transforms.v2 import functional as VF, InterpolationMode + +import random +from functools import partial + +from deim._engine.core import register +torchvision.disable_beta_transforms_warning() +from copy import deepcopy +from PIL import Image, ImageDraw +import os + + +__all__ = [ + 'DataLoader', + 'BaseCollateFunction', + 'BatchImageCollateFunction', + 'batch_image_collate_fn' +] + + +@register() +class DataLoader(data.DataLoader): + __inject__ = ['dataset', 'collate_fn'] + + def __repr__(self) -> str: + format_string = self.__class__.__name__ + "(" + for n in ['dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn']: + format_string += "\n" + format_string += " {0}: {1}".format(n, getattr(self, n)) + format_string += "\n)" + return format_string + + def set_epoch(self, epoch): + self._epoch = epoch + self.dataset.set_epoch(epoch) + self.collate_fn.set_epoch(epoch) + + @property + def epoch(self): + return self._epoch if hasattr(self, '_epoch') else -1 + + @property + def shuffle(self): + return self._shuffle + + @shuffle.setter + def shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be a boolean' + self._shuffle = shuffle + + +@register() +def batch_image_collate_fn(items): + """only batch image + """ + return torch.cat([x[0][None] for x in items], dim=0), [x[1] for x in items] + + +class BaseCollateFunction(object): + def set_epoch(self, epoch): + self._epoch = epoch + + @property + def epoch(self): + return self._epoch if hasattr(self, '_epoch') else -1 + + def __call__(self, items): + raise NotImplementedError('') + + +def generate_scales(base_size, base_size_repeat): + scale_repeat = (base_size - int(base_size * 0.75 / 32) * 32) // 32 + scales = [int(base_size * 0.75 / 32) * 32 + i * 32 for i in range(scale_repeat)] + scales += [base_size] * base_size_repeat + scales += [int(base_size * 1.25 / 32) * 32 - i * 32 for i in range(scale_repeat)] + return scales + + +@register() +class BatchImageCollateFunction(BaseCollateFunction): + def __init__( + self, + stop_epoch=None, + ema_restart_decay=0.9999, + base_size=640, + base_size_repeat=None, + mixup_prob=0.0, + mixup_epochs=[0, 0], + data_vis=False, + 
vis_save='./vis_dataset/' + ) -> None: + super().__init__() + self.base_size = base_size + self.scales = generate_scales(base_size, base_size_repeat) if base_size_repeat is not None else None + self.stop_epoch = stop_epoch if stop_epoch is not None else 100000000 + self.ema_restart_decay = ema_restart_decay + # FIXME Mixup + self.mixup_prob, self.mixup_epochs = mixup_prob, mixup_epochs + if self.mixup_prob > 0: + self.data_vis, self.vis_save = data_vis, vis_save + os.makedirs(self.vis_save, exist_ok=True) if self.data_vis else None + print(" ### Using MixUp with Prob@{} in {} epochs ### ".format(self.mixup_prob, self.mixup_epochs)) + if stop_epoch is not None: + print(" ### Multi-scale Training until {} epochs ### ".format(self.stop_epoch)) + print(" ### Multi-scales@ {} ### ".format(self.scales)) + self.print_info_flag = True + # self.interpolation = interpolation + + def apply_mixup(self, images, targets): + """ + Applies Mixup augmentation to the batch if conditions are met. + + Args: + images (torch.Tensor): Batch of images. + targets (list[dict]): List of target dictionaries corresponding to images. + + Returns: + tuple: Updated images and targets + """ + # Log when Mixup is permanently disabled + if self.epoch == self.mixup_epochs[-1] and self.print_info_flag: + print(f" ### Attention --- Mixup is closed after epoch@ {self.epoch} ###") + self.print_info_flag = False + + # Apply Mixup if within specified epoch range and probability threshold + if random.random() < self.mixup_prob and self.mixup_epochs[0] <= self.epoch < self.mixup_epochs[-1]: + # Generate mixup ratio + beta = round(random.uniform(0.45, 0.55), 6) + + # Mix images + images = images.roll(shifts=1, dims=0).mul_(1.0 - beta).add_(images.mul(beta)) + + # Prepare targets for Mixup + shifted_targets = targets[-1:] + targets[:-1] + updated_targets = deepcopy(targets) + + for i in range(len(targets)): + # Combine boxes, labels, and areas from original and shifted targets + updated_targets[i]['boxes'] = torch.cat([targets[i]['boxes'], shifted_targets[i]['boxes']], dim=0) + updated_targets[i]['labels'] = torch.cat([targets[i]['labels'], shifted_targets[i]['labels']], dim=0) + updated_targets[i]['area'] = torch.cat([targets[i]['area'], shifted_targets[i]['area']], dim=0) + + # Add mixup ratio to targets + updated_targets[i]['mixup'] = torch.tensor( + [beta] * len(targets[i]['labels']) + [1.0 - beta] * len(shifted_targets[i]['labels']), + dtype=torch.float32 + ) + targets = updated_targets + + if self.data_vis: + for i in range(len(updated_targets)): + image_tensor = images[i] + image_tensor_uint8 = (image_tensor * 255).type(torch.uint8) + image_numpy = image_tensor_uint8.numpy().transpose((1, 2, 0)) + pilImage = Image.fromarray(image_numpy) + draw = ImageDraw.Draw(pilImage) + print('mix_vis:', i, 'boxes.len=', len(updated_targets[i]['boxes'])) + for box in updated_targets[i]['boxes']: + draw.rectangle([int(box[0]*640 - (box[2]*640)/2), int(box[1]*640 - (box[3]*640)/2), + int(box[0]*640 + (box[2]*640)/2), int(box[1]*640 + (box[3]*640)/2)], outline=(255,255,0)) + pilImage.save(self.vis_save + str(i) + "_"+ str(len(updated_targets[i]['boxes'])) +'_out.jpg') + + return images, targets + + def __call__(self, items): + images = torch.cat([x[0][None] for x in items], dim=0) + targets = [x[1] for x in items] + + # Mixup + images, targets = self.apply_mixup(images, targets) + + if self.scales is not None and self.epoch < self.stop_epoch: + # sz = random.choice(self.scales) + # sz = [sz] if isinstance(sz, int) else list(sz) + # 
VF.resize(inpt, sz, interpolation=self.interpolation) + + sz = random.choice(self.scales) + images = F.interpolate(images, size=sz) + if 'masks' in targets[0]: + for tg in targets: + tg['masks'] = F.interpolate(tg['masks'], size=sz, mode='nearest') + raise NotImplementedError('') + + return images, targets diff --git a/deim/_engine/data/dataset/__init__.py b/deim/_engine/data/dataset/__init__.py new file mode 100644 index 00000000..14bfd533 --- /dev/null +++ b/deim/_engine/data/dataset/__init__.py @@ -0,0 +1,16 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +# from ._dataset import DetDataset +from deim._engine.data.dataset.coco_dataset import CocoDetection +from deim._engine.data.dataset.coco_dataset import ( + mscoco_category2name, + mscoco_category2label, + mscoco_label2category, +) +from deim._engine.data.dataset.coco_eval import CocoEvaluator +from deim._engine.data.dataset.coco_utils import get_coco_api_from_dataset +from deim._engine.data.dataset.voc_detection import VOCDetection +from deim._engine.data.dataset.voc_eval import VOCEvaluator diff --git a/engine/data/dataset/_dataset.py b/deim/_engine/data/dataset/_dataset.py similarity index 100% rename from engine/data/dataset/_dataset.py rename to deim/_engine/data/dataset/_dataset.py diff --git a/deim/_engine/data/dataset/coco_dataset.py b/deim/_engine/data/dataset/coco_dataset.py new file mode 100644 index 00000000..211ed5db --- /dev/null +++ b/deim/_engine/data/dataset/coco_dataset.py @@ -0,0 +1,264 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.utils.data + +import torchvision + +from PIL import Image +import faster_coco_eval +import faster_coco_eval.core.mask as coco_mask +from deim._engine.data.dataset._dataset import DetDataset +from deim._engine.data._misc import convert_to_tv_tensor +from deim._engine.core import register + +torchvision.disable_beta_transforms_warning() +faster_coco_eval.init_as_pycocotools() +Image.MAX_IMAGE_PIXELS = None + +__all__ = ['CocoDetection'] + + +@register() +class CocoDetection(torchvision.datasets.CocoDetection, DetDataset): + __inject__ = ['transforms', ] + __share__ = ['remap_mscoco_category'] + + def __init__(self, img_folder, ann_file, transforms, return_masks=False, remap_mscoco_category=False): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + self.img_folder = img_folder + self.ann_file = ann_file + self.return_masks = return_masks + self.remap_mscoco_category = remap_mscoco_category + + def __getitem__(self, idx): + img, target = self.load_item(idx) + if self._transforms is not None: + img, target, _ = self._transforms(img, target, self) + return img, target + + def load_item(self, idx): + image, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': target} + + if self.remap_mscoco_category: + image, target = self.prepare(image, target, category2label=mscoco_category2label) + else: + image, target = self.prepare(image, target) + + target['idx'] = torch.tensor([idx]) + + if 'boxes' in target: + target['boxes'] = convert_to_tv_tensor(target['boxes'], key='boxes', spatial_size=image.size[::-1]) + + if 'masks' in target: + target['masks'] = convert_to_tv_tensor(target['masks'], key='masks') + + return image, target + + def extra_repr(self) -> str: + s = f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n' + s += f' return_masks: {self.return_masks}\n' + if hasattr(self, '_transforms') and self._transforms is not None: + s += f' transforms:\n {repr(self._transforms)}' + if hasattr(self, '_preset') and self._preset is not None: + s += f' preset:\n {repr(self._preset)}' + return s + + @property + def categories(self, ): + return self.coco.dataset['categories'] + + @property + def category2name(self, ): + return {cat['id']: cat['name'] for cat in self.categories} + + @property + def category2label(self, ): + return {cat['id']: i for i, cat in enumerate(self.categories)} + + @property + def label2category(self, ): + return {i: cat['id'] for i, cat in enumerate(self.categories)} + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image: Image.Image, target, **kwargs): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard 
against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + category2label = kwargs.get('category2label', None) + if category2label is not None: + labels = [category2label[obj["category_id"]] for obj in anno] + else: + labels = [obj["category_id"] for obj in anno] + + labels = torch.tensor(labels, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + labels = labels[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = labels + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(w), int(h)]) + # target["size"] = torch.as_tensor([int(w), int(h)]) + + return image, target + + +mscoco_category2name = { + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', + 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' +} + +mscoco_category2label = {k: i for i, k in enumerate(mscoco_category2name.keys())} +mscoco_label2category = {v: k for k, v in mscoco_category2label.items()} diff --git a/deim/_engine/data/dataset/coco_eval.py b/deim/_engine/data/dataset/coco_eval.py new file mode 100644 index 00000000..76f3f4ca --- /dev/null +++ b/deim/_engine/data/dataset/coco_eval.py @@ -0,0 +1,200 @@ +""" +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +COCO evaluator that works in distributed mode. +Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py +The difference is that there is less copy-pasting from pycocotools +in the end of the file, as python3 can suppress prints with contextlib +""" +import os +import contextlib +import copy +import numpy as np +import torch + +from faster_coco_eval import COCO, COCOeval_faster +import faster_coco_eval.core.mask as mask_util +from deim._engine.core import register +from deim._engine.misc import dist_utils +__all__ = ['CocoEvaluator',] + + +@register() +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt : COCO = coco_gt + self.iou_types = iou_types + + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval_faster(coco_gt, iouType=iou_type, print_function=print, separate_eval=True) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def cleanup(self): + self.coco_eval = {} + for iou_type in self.iou_types: + self.coco_eval[iou_type] = COCOeval_faster(self.coco_gt, iouType=iou_type, print_function=print, separate_eval=True) + self.img_ids = [] + self.eval_imgs = {k: [] for k in self.iou_types} + + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + coco_eval = self.coco_eval[iou_type] + + # suppress pycocotools prints + with open(os.devnull, 'w') as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = self.coco_gt.loadRes(results) if results else COCO() + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + coco_eval.evaluate() + + self.eval_imgs[iou_type].append(np.array(coco_eval._evalImgs_cpp).reshape(len(coco_eval.params.catIds), len(coco_eval.params.areaRng), len(coco_eval.params.imgIds))) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + img_ids, eval_imgs = merge(self.img_ids, self.eval_imgs[iou_type]) + + coco_eval = self.coco_eval[iou_type] + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + coco_eval._evalImgs_cpp = eval_imgs + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + 
for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + 'keypoints': keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + +def merge(img_ids, eval_imgs): + all_img_ids = dist_utils.all_gather(img_ids) + all_eval_imgs = dist_utils.all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.extend(p) + + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, axis=2).ravel() + # merged_eval_imgs = np.array(merged_eval_imgs).T.ravel() + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + + return merged_img_ids.tolist(), merged_eval_imgs.tolist() diff --git a/engine/data/dataset/coco_utils.py b/deim/_engine/data/dataset/coco_utils.py similarity index 100% rename from engine/data/dataset/coco_utils.py rename to deim/_engine/data/dataset/coco_utils.py diff --git a/deim/_engine/data/dataset/voc_detection.py b/deim/_engine/data/dataset/voc_detection.py new file mode 100644 index 00000000..9b915609 --- /dev/null +++ b/deim/_engine/data/dataset/voc_detection.py @@ -0,0 +1,76 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +from sympy import im +import torch +import torchvision +import torchvision.transforms.functional as TVF + +import os +from PIL import Image +from typing import Optional, Callable + +try: + from defusedxml.ElementTree import parse as ET_parse +except ImportError: + from xml.etree.ElementTree import parse as ET_parse + +from deim._engine.data.dataset._dataset import DetDataset +from deim._engine.data._misc import convert_to_tv_tensor +from deim._engine.core import register + +@register() +class VOCDetection(torchvision.datasets.VOCDetection, DetDataset): + __inject__ = ['transforms', ] + + def __init__(self, root: str, ann_file: str = "trainval.txt", label_file: str = "label_list.txt", transforms: Optional[Callable] = None): + + with open(os.path.join(root, ann_file), 'r') as f: + lines = [x.strip() for x in f.readlines()] + lines = [x.split(' ') for x in lines] + + self.images = [os.path.join(root, lin[0]) for lin in lines] + self.targets = [os.path.join(root, lin[1]) for lin in lines] + assert len(self.images) == len(self.targets) + + with open(os.path.join(root + label_file), 'r') as f: + labels = f.readlines() + labels = [lab.strip() for lab in labels] + + self.transforms = transforms + self.labels_map = {lab: i for i, lab in enumerate(labels)} + + def __getitem__(self, index: int): + image, target = self.load_item(index) + if self.transforms is not None: + image, target, _ = self.transforms(image, target, self) + # target["orig_size"] = torch.tensor(TVF.get_image_size(image)) + return image, target + + def load_item(self, index: int): + image = Image.open(self.images[index]).convert("RGB") + target = self.parse_voc_xml(ET_parse(self.annotations[index]).getroot()) + + output = {} + output["image_id"] = torch.tensor([index]) + for k in ['area', 'boxes', 'labels', 'iscrowd']: + output[k] = [] + + for blob in target['annotation']['object']: + box = [float(v) for v in blob['bndbox'].values()] + output["boxes"].append(box) + output["labels"].append(blob['name']) + output["area"].append((box[2] - box[0]) * (box[3] - box[1])) + output["iscrowd"].append(0) + + w, h = image.size + boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4) + output['boxes'] = convert_to_tv_tensor(boxes, 'boxes', box_format='xyxy', spatial_size=[h, w]) + output['labels'] = torch.tensor([self.labels_map[lab] for lab in output["labels"]]) + output['area'] = torch.tensor(output['area']) + output["iscrowd"] = torch.tensor(output["iscrowd"]) + output["orig_size"] = torch.tensor([w, h]) + + return image, output diff --git a/engine/data/dataset/voc_eval.py b/deim/_engine/data/dataset/voc_eval.py similarity index 100% rename from engine/data/dataset/voc_eval.py rename to deim/_engine/data/dataset/voc_eval.py diff --git a/deim/_engine/data/transforms/__init__.py b/deim/_engine/data/transforms/__init__.py new file mode 100644 index 00000000..e591d259 --- /dev/null +++ b/deim/_engine/data/transforms/__init__.py @@ -0,0 +1,22 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + + +from deim._engine.data.transforms._transforms import ( + EmptyTransform, + RandomPhotometricDistort, + RandomZoomOut, + RandomIoUCrop, + RandomHorizontalFlip, + Resize, + PadToSize, + SanitizeBoundingBoxes, + RandomCrop, + Normalize, + ConvertBoxes, + ConvertPILImage, +) +from deim._engine.data.transforms.container import Compose +from deim._engine.data.transforms.mosaic import Mosaic \ No newline at end of file diff --git a/engine/data/transforms/_transforms.py b/deim/_engine/data/transforms/_transforms.py similarity index 89% rename from engine/data/transforms/_transforms.py rename to deim/_engine/data/transforms/_transforms.py index 31588df5..7f51a9bc 100644 --- a/engine/data/transforms/_transforms.py +++ b/deim/_engine/data/transforms/_transforms.py @@ -15,11 +15,11 @@ from typing import Any, Dict, List, Optional -from .._misc import convert_to_tv_tensor, _boxes_keys -from .._misc import Image, Video, Mask, BoundingBoxes -from .._misc import SanitizeBoundingBoxes +from deim._engine.data._misc import convert_to_tv_tensor, _boxes_keys +from deim._engine.data._misc import Image, Video, Mask, BoundingBoxes +from deim._engine.data._misc import SanitizeBoundingBoxes -from ...core import register +from deim._engine.core import register torchvision.disable_beta_transforms_warning() @@ -100,6 +100,9 @@ def __init__(self, fmt='', normalize=False) -> None: super().__init__() self.fmt = fmt self.normalize = normalize + + def transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return self._transform(inpt, params) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: spatial_size = getattr(inpt, _boxes_keys[1]) @@ -123,6 +126,9 @@ def __init__(self, dtype='float32', scale=True) -> None: super().__init__() self.dtype = dtype self.scale = scale + + def transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return self._transform(inpt, params) def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: inpt = F.pil_to_tensor(inpt) diff --git a/deim/_engine/data/transforms/container.py b/deim/_engine/data/transforms/container.py new file mode 100644 index 00000000..0678e1c3 --- /dev/null +++ b/deim/_engine/data/transforms/container.py @@ -0,0 +1,127 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 D-FINE authors. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn + +import torchvision +import torchvision.transforms.v2 as T + +from typing import Any, Dict, List, Optional + +from deim._engine.data.transforms._transforms import EmptyTransform +from deim._engine.core import register, GLOBAL_CONFIG +torchvision.disable_beta_transforms_warning() +import random + + +@register() +class Compose(T.Compose): + def __init__(self, ops, policy=None, mosaic_prob=-0.1) -> None: + transforms = [] + if ops is not None: + for op in ops: + if isinstance(op, dict): + name = op.pop('type') + transform = getattr(GLOBAL_CONFIG[name]['_pymodule'], GLOBAL_CONFIG[name]['_name'])(**op) + transforms.append(transform) + op['type'] = name + print(" ### Transform @{} ### ".format(type(transform).__name__)) + + elif isinstance(op, nn.Module): + transforms.append(op) + + else: + raise ValueError('') + else: + transforms =[EmptyTransform(), ] + + super().__init__(transforms=transforms) + + self.mosaic_prob = mosaic_prob + if policy is None: + policy = {'name': 'default'} + else: + if self.mosaic_prob > 0: + print(" ### Mosaic with Prob.@{} and ZoomOut/IoUCrop existed ### ".format(self.mosaic_prob)) + print(" ### ImgTransforms Epochs: {} ### ".format(policy['epoch'])) + print(' ### Policy_ops@{} ###'.format(policy['ops'])) + self.global_samples = 0 + self.policy = policy + + def forward(self, *inputs: Any) -> Any: + return self.get_forward(self.policy['name'])(*inputs) + + def get_forward(self, name): + forwards = { + 'default': self.default_forward, + 'stop_epoch': self.stop_epoch_forward, + 'stop_sample': self.stop_sample_forward, + } + return forwards[name] + + def default_forward(self, *inputs: Any) -> Any: + sample = inputs if len(inputs) > 1 else inputs[0] + for transform in self.transforms: + sample = transform(sample) + return sample + + def stop_epoch_forward(self, *inputs: Any): + sample = inputs if len(inputs) > 1 else inputs[0] + dataset = sample[-1] + cur_epoch = dataset.epoch + policy_ops = self.policy['ops'] + policy_epoch = self.policy['epoch'] + + if isinstance(policy_epoch, list) and len(policy_epoch) == 3: # 4-stages + if policy_epoch[0] <= cur_epoch < policy_epoch[1]: + with_mosaic = random.random() <= self.mosaic_prob # Probility for Mosaic + else: + with_mosaic = False + for transform in self.transforms: + # TODO print the transform to get the order + if (type(transform).__name__ in policy_ops and cur_epoch < policy_epoch[0]): # first stage: NoAug + pass + elif (type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch[-1]): # last stage: NoAug + pass + else: + # Using Mosaic for [policy_epoch[0], policy_epoch[1]] with probability + if (type(transform).__name__ == 'Mosaic' and not with_mosaic): + pass + # Mosaic and Zoomout/IoUCrop can not be co-existed in the same sample + elif (type(transform).__name__ == 'RandomZoomOut' or type(transform).__name__ == 'RandomIoUCrop') and with_mosaic: + pass + else: + sample = transform(sample) + else: # the default data scheduler + for transform in self.transforms: + if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch: + pass + else: + sample = transform(sample) + + return sample + + + def stop_sample_forward(self, *inputs: Any): + sample = inputs if len(inputs) > 1 else inputs[0] + dataset = sample[-1] + + cur_epoch = dataset.epoch + policy_ops = self.policy['ops'] + policy_sample = self.policy['sample'] + + for transform in self.transforms: + if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample: + pass + else: + sample = 
transform(sample) + + self.global_samples += 1 + + return sample diff --git a/engine/data/transforms/functional.py b/deim/_engine/data/transforms/functional.py similarity index 100% rename from engine/data/transforms/functional.py rename to deim/_engine/data/transforms/functional.py diff --git a/deim/_engine/data/transforms/mosaic.py b/deim/_engine/data/transforms/mosaic.py new file mode 100644 index 00000000..dfea507e --- /dev/null +++ b/deim/_engine/data/transforms/mosaic.py @@ -0,0 +1,168 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" + +import torch +import torchvision.transforms.v2 as T +import torchvision.transforms.v2.functional as F +import random +from PIL import Image + +from deim._engine.data._misc import convert_to_tv_tensor +from deim._engine.core import register + + +@register() +class Mosaic(T.Transform): + """ + Applies Mosaic augmentation to a batch of images. Combines four randomly selected images + into a single composite image with randomized transformations. + """ + + def __init__(self, output_size=320, max_size=None, rotation_range=0, translation_range=(0.1, 0.1), + scaling_range=(0.5, 1.5), probability=1.0, fill_value=114, use_cache=True, max_cached_images=50, + random_pop=True) -> None: + """ + Args: + output_size (int): Target size for resizing individual images. + rotation_range (float): Range of rotation in degrees for affine transformation. + translation_range (tuple): Range of translation for affine transformation. + scaling_range (tuple): Range of scaling factors for affine transformation. + probability (float): Probability of applying the Mosaic augmentation. + fill_value (int): Fill value for padding or affine transformations. + use_cache (bool): Whether to use cache. Defaults to True. + max_cached_images (int): The maximum length of the cache. + random_pop (bool): Whether to randomly pop a result from the cache. 
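+
+        Example (sketch only; assumes the pipeline passes (image, target, dataset) triples):
+            mosaic = Mosaic(output_size=320, probability=0.5)
+            image, target, dataset = mosaic(image, target, dataset)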
+ """ + super().__init__() + self.resize = T.Resize(size=output_size, max_size=max_size) + self.probability = probability + self.affine_transform = T.RandomAffine(degrees=rotation_range, translate=translation_range, + scale=scaling_range, fill=fill_value) + self.use_cache = use_cache + self.mosaic_cache = [] + self.max_cached_images = max_cached_images + self.random_pop = random_pop + + def load_samples_from_dataset(self, image, target, dataset): + """Loads and resizes a set of images and their corresponding targets.""" + # Append the main image + get_size_func = F.get_size if hasattr(F, "get_size") else F.get_spatial_size # torchvision >=0.17 is get_size + image, target = self.resize(image, target) + resized_images, resized_targets = [image], [target] + max_height, max_width = get_size_func(resized_images[0]) + + # randomly select 3 images + sample_indices = random.choices(range(len(dataset)), k=3) + for idx in sample_indices: + # image, target = dataset.load_item(idx) + image, target = self.resize(dataset.load_item(idx)) + height, width = get_size_func(image) + max_height, max_width = max(max_height, height), max(max_width, width) + resized_images.append(image) + resized_targets.append(target) + + return resized_images, resized_targets, max_height, max_width + + def load_samples_from_cache(self, image, target, cache): + image, target = self.resize(image, target) + cache.append(dict(img=image, labels=target)) + + if len(cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(cache) - 2) # do not remove last image + else: + index = 0 + cache.pop(index) + sample_indices = random.choices(range(len(cache)), k=3) + mosaic_samples = [dict(img=cache[idx]["img"].copy(), labels=self._clone(cache[idx]["labels"])) for idx in + sample_indices] # sample 3 images + mosaic_samples = [dict(img=image.copy(), labels=self._clone(target))] + mosaic_samples + + get_size_func = F.get_size if hasattr(F, "get_size") else F.get_spatial_size + sizes = [get_size_func(mosaic_samples[idx]["img"]) for idx in range(4)] + max_height = max(size[0] for size in sizes) + max_width = max(size[1] for size in sizes) + + return mosaic_samples, max_height, max_width + + def create_mosaic_from_cache(self, mosaic_samples, max_height, max_width): + placement_offsets = [[0, 0], [max_width, 0], [0, max_height], [max_width, max_height]] + merged_image = Image.new(mode=mosaic_samples[0]["img"].mode, size=(max_width * 2, max_height * 2), color=0) + offsets = torch.tensor([[0, 0], [max_width, 0], [0, max_height], [max_width, max_height]]).repeat(1, 2) + + mosaic_target = [] + for i, sample in enumerate(mosaic_samples): + img = sample["img"] + target = sample["labels"] + + merged_image.paste(img, placement_offsets[i]) + target['boxes'] = target['boxes'] + offsets[i] + mosaic_target.append(target) + + merged_target = {} + for key in mosaic_target[0]: + merged_target[key] = torch.cat([target[key] for target in mosaic_target]) + + return merged_image, merged_target + + def create_mosaic_from_dataset(self, images, targets, max_height, max_width): + """Creates a mosaic image by combining multiple images.""" + placement_offsets = [[0, 0], [max_width, 0], [0, max_height], [max_width, max_height]] + merged_image = Image.new(mode=images[0].mode, size=(max_width * 2, max_height * 2), color=0) + for i, img in enumerate(images): + merged_image.paste(img, placement_offsets[i]) + + """Merges targets into a single target dictionary for the mosaic.""" + offsets = torch.tensor([[0, 0], [max_width, 0], [0, max_height], 
[max_width, max_height]]).repeat(1, 2) + merged_target = {} + for key in targets[0]: + if key == 'boxes': + values = [target[key] + offsets[i] for i, target in enumerate(targets)] + else: + values = [target[key] for target in targets] + + merged_target[key] = torch.cat(values, dim=0) if isinstance(values[0], torch.Tensor) else values + + return merged_image, merged_target + + @staticmethod + def _clone(tensor_dict): + return {key: value.clone() for (key, value) in tensor_dict.items()} + + def forward(self, *inputs): + """ + Args: + inputs (tuple): Input tuple containing (image, target, dataset). + + Returns: + tuple: Augmented (image, target, dataset). + """ + if len(inputs) == 1: + inputs = inputs[0] + image, target, dataset = inputs + + # Skip mosaic augmentation with probability 1 - self.probability + if self.probability < 1.0 and random.random() > self.probability: + return image, target, dataset + + # Prepare mosaic components + if self.use_cache: + mosaic_samples, max_height, max_width = self.load_samples_from_cache(image, target, self.mosaic_cache) + mosaic_image, mosaic_target = self.create_mosaic_from_cache(mosaic_samples, max_height, max_width) + else: + resized_images, resized_targets, max_height, max_width = self.load_samples_from_dataset(image, target,dataset) + mosaic_image, mosaic_target = self.create_mosaic_from_dataset(resized_images, resized_targets, max_height, max_width) + + # Clamp boxes and convert target formats + if 'boxes' in mosaic_target: + mosaic_target['boxes'] = convert_to_tv_tensor(mosaic_target['boxes'], 'boxes', box_format='xyxy', + spatial_size=mosaic_image.size[::-1]) + if 'masks' in mosaic_target: + mosaic_target['masks'] = convert_to_tv_tensor(mosaic_target['masks'], 'masks') + + # Apply affine transformations + mosaic_image, mosaic_target = self.affine_transform(mosaic_image, mosaic_target) + + return mosaic_image, mosaic_target, dataset diff --git a/engine/deim/__init__.py b/deim/_engine/deim/__init__.py similarity index 100% rename from engine/deim/__init__.py rename to deim/_engine/deim/__init__.py diff --git a/engine/deim/box_ops.py b/deim/_engine/deim/box_ops.py similarity index 100% rename from engine/deim/box_ops.py rename to deim/_engine/deim/box_ops.py diff --git a/engine/deim/deim.py b/deim/_engine/deim/deim.py similarity index 100% rename from engine/deim/deim.py rename to deim/_engine/deim/deim.py diff --git a/engine/deim/deim_criterion.py b/deim/_engine/deim/deim_criterion.py similarity index 100% rename from engine/deim/deim_criterion.py rename to deim/_engine/deim/deim_criterion.py diff --git a/engine/deim/denoising.py b/deim/_engine/deim/denoising.py similarity index 100% rename from engine/deim/denoising.py rename to deim/_engine/deim/denoising.py diff --git a/engine/deim/dfine_decoder.py b/deim/_engine/deim/dfine_decoder.py similarity index 100% rename from engine/deim/dfine_decoder.py rename to deim/_engine/deim/dfine_decoder.py diff --git a/engine/deim/dfine_utils.py b/deim/_engine/deim/dfine_utils.py similarity index 100% rename from engine/deim/dfine_utils.py rename to deim/_engine/deim/dfine_utils.py diff --git a/engine/deim/hybrid_encoder.py b/deim/_engine/deim/hybrid_encoder.py similarity index 100% rename from engine/deim/hybrid_encoder.py rename to deim/_engine/deim/hybrid_encoder.py diff --git a/deim/_engine/deim/matcher.py b/deim/_engine/deim/matcher.py new file mode 100644 index 00000000..b1e6e1d5 --- /dev/null +++ b/deim/_engine/deim/matcher.py @@ -0,0 +1,134 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
+Modules to compute the matching cost and solve the corresponding LSAP.
+
+Copyright (c) 2024 The D-FINE Authors All Rights Reserved.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from scipy.optimize import linear_sum_assignment
+from typing import Dict
+
+from deim._engine.deim.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
+
+from deim._engine.core import register
+import numpy as np
+
+
+@register()
+class HungarianMatcher(nn.Module):
+    """This class computes an assignment between the targets and the predictions of the network
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
+    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
+    while the others are un-matched (and thus treated as non-objects).
+    """
+
+    __share__ = ['use_focal_loss', ]
+
+    def __init__(self, weight_dict, use_focal_loss=False, alpha=0.25, gamma=2.0):
+        """Creates the matcher
+
+        Params:
+            weight_dict: dict with keys 'cost_class', 'cost_bbox' and 'cost_giou', the relative weights
+                of the classification error, the L1 error of the bounding box coordinates and the giou
+                loss of the bounding box in the matching cost
+        """
+        super().__init__()
+        self.cost_class = weight_dict['cost_class']
+        self.cost_bbox = weight_dict['cost_bbox']
+        self.cost_giou = weight_dict['cost_giou']
+
+        self.use_focal_loss = use_focal_loss
+        self.alpha = alpha
+        self.gamma = gamma
+
+        assert self.cost_class != 0 or self.cost_bbox != 0 or self.cost_giou != 0, "all costs can't be 0"
+
+    @torch.no_grad()
+    def forward(self, outputs: Dict[str, torch.Tensor], targets, return_topk=False):
+        """ Performs the matching
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        bs, num_queries = outputs["pred_logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        if self.use_focal_loss:
+            out_prob = F.sigmoid(outputs["pred_logits"].flatten(0, 1))
+        else:
+            out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        tgt_ids = torch.cat([v["labels"] for v in targets])
+        tgt_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+        # but approximate it in 1 - proba[target class].
+        # The 1 is a constant that doesn't change the matching, it can be omitted.
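+        # With use_focal_loss, the classification cost computed below follows the focal-loss form:
+        #   cost_class = alpha * (1 - p)^gamma * (-log(p)) - (1 - alpha) * p^gamma * (-log(1 - p))
+        # where p is the predicted probability of the target class; otherwise it is simply -p.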
+        if self.use_focal_loss:
+            out_prob = out_prob[:, tgt_ids]
+            neg_cost_class = (1 - self.alpha) * (out_prob ** self.gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log())
+            cost_class = pos_cost_class - neg_cost_class
+        else:
+            cost_class = -out_prob[:, tgt_ids]
+
+        # Compute the L1 cost between boxes
+        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
+
+        # Final cost matrix: self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
+        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
+        C = C.view(bs, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        # FIXME, RT-DETR uses a different way to set NaN
+        C = torch.nan_to_num(C, nan=1.0)
+        indices_pre = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
+        indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices_pre]
+
+        # Compute topk indices
+        if return_topk:
+            return {'indices_o2m': self.get_top_k_matches(C, sizes=sizes, k=return_topk, initial_indices=indices_pre)}
+
+        return {'indices': indices} # , 'indices_o2m': C.min(-1)[1]}
+
+    def get_top_k_matches(self, C, sizes, k=1, initial_indices=None):
+        indices_list = []
+        # C_original = C.clone()
+        for i in range(k):
+            indices_k = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] if i > 0 else initial_indices
+            indices_list.append([
+                (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
+                for i, j in indices_k
+            ])
+            for c, idx_k in zip(C.split(sizes, -1), indices_k):
+                idx_k = np.stack(idx_k)
+                c[:, idx_k] = 1e6
+        indices_list = [(torch.cat([indices_list[i][j][0] for i in range(k)], dim=0),
+                         torch.cat([indices_list[i][j][1] for i in range(k)], dim=0)) for j in range(len(sizes))]
+        # C.copy_(C_original)
+        return indices_list
diff --git a/deim/_engine/deim/postprocessor.py b/deim/_engine/deim/postprocessor.py
new file mode 100644
index 00000000..5173bb25
--- /dev/null
+++ b/deim/_engine/deim/postprocessor.py
@@ -0,0 +1,95 @@
+"""
+Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import torchvision + +from deim._engine.core import register + + +__all__ = ['PostProcessor'] + + +def mod(a, b): + out = a - a // b * b + return out + + +@register() +class PostProcessor(nn.Module): + __share__ = [ + 'num_classes', + 'use_focal_loss', + 'num_top_queries', + 'remap_mscoco_category' + ] + + def __init__( + self, + num_classes=80, + use_focal_loss=True, + num_top_queries=300, + remap_mscoco_category=False + ) -> None: + super().__init__() + self.use_focal_loss = use_focal_loss + self.num_top_queries = num_top_queries + self.num_classes = int(num_classes) + self.remap_mscoco_category = remap_mscoco_category + self.deploy_mode = False + + def extra_repr(self) -> str: + return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}' + + # def forward(self, outputs, orig_target_sizes): + def forward(self, outputs, orig_target_sizes: torch.Tensor): + logits, boxes = outputs['pred_logits'], outputs['pred_boxes'] + # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1) + + if self.use_focal_loss: + scores = F.sigmoid(logits) + scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1) + # TODO for older tensorrt + # labels = index % self.num_classes + labels = mod(index, self.num_classes) + index = index // self.num_classes + boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1])) + + else: + scores = F.softmax(logits)[:, :, :-1] + scores, labels = scores.max(dim=-1) + if scores.shape[1] > self.num_top_queries: + scores, index = torch.topk(scores, self.num_top_queries, dim=-1) + labels = torch.gather(labels, dim=1, index=index) + boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) + + # TODO for onnx export + if self.deploy_mode: + return labels, boxes, scores + + # TODO + if self.remap_mscoco_category: + from deim._engine.data.dataset import mscoco_label2category + labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\ + .to(boxes.device).reshape(labels.shape) + + results = [] + for lab, box, sco in zip(labels, boxes, scores): + result = dict(labels=lab, boxes=box, scores=sco) + results.append(result) + + return results + + + def deploy(self, ): + self.eval() + self.deploy_mode = True + return self diff --git a/deim/_engine/deim/rtdetrv2_decoder.py b/deim/_engine/deim/rtdetrv2_decoder.py new file mode 100644 index 00000000..ad30be8d --- /dev/null +++ b/deim/_engine/deim/rtdetrv2_decoder.py @@ -0,0 +1,623 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +Modifications Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 
+""" + +import math +import copy +import functools +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from typing import List + +from deim._engine.deim.denoising import get_contrastive_denoising_training_group +from deim._engine.deim.utils import bias_init_with_prob, get_activation, inverse_sigmoid +from deim._engine.deim.utils import deformable_attention_core_func_v2 + +from deim._engine.core import register + +__all__ = ['RTDETRTransformerv2'] + + +class MLP(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.act = get_activation(act) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class MSDeformableAttention(nn.Module): + def __init__( + self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + method='default', + offset_scale=0.5, + value_shape='default', + ): + """Multi-Scale Deformable Attention + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.offset_scale = offset_scale + + if isinstance(num_points, list): + assert len(num_points) == num_levels, '' + num_points_list = num_points + else: + num_points_list = [num_points for _ in range(num_levels)] + + self.num_points_list = num_points_list + + num_points_scale = [1/n for n in num_points_list for _ in range(n)] + self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32)) + + self.total_points = num_heads * sum(num_points_list) + self.method = method + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + + self.ms_deformable_attn_core = functools.partial(deformable_attention_core_func_v2, + method=self.method, value_shape=value_shape) + + self._reset_parameters() + + if method == 'discrete': + for p in self.sampling_offsets.parameters(): + p.requires_grad = False + + def _reset_parameters(self): + # sampling_offsets + init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] 
= grid_init.flatten() + + # attention_weights + init.constant_(self.attention_weights.weight, 0) + init.constant_(self.attention_weights.bias, 0) + + # proj + init.xavier_uniform_(self.value_proj.weight) + init.constant_(self.value_proj.bias, 0) + init.xavier_uniform_(self.output_proj.weight) + init.constant_(self.output_proj.bias, 0) + + + def forward(self, + query: torch.Tensor, + reference_points: torch.Tensor, + value: torch.Tensor, + value_spatial_shapes: List[int], + value_mask: torch.Tensor=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + + value = self.value_proj(value) + if value_mask is not None: + value = value * value_mask.to(value.dtype).unsqueeze(-1) + + value = value.reshape(bs, Len_v, self.num_heads, self.head_dim) + + sampling_offsets: torch.Tensor = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.reshape(bs, Len_q, self.num_heads, sum(self.num_points_list), 2) + + attention_weights = self.attention_weights(query).reshape(bs, Len_q, self.num_heads, sum(self.num_points_list)) + attention_weights = F.softmax(attention_weights, dim=-1).reshape(bs, Len_q, self.num_heads, sum(self.num_points_list)) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) + sampling_locations = reference_points.reshape(bs, Len_q, 1, self.num_levels, 1, 2) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + # reference_points [8, 480, None, 1, 4] + # sampling_offsets [8, 480, 8, 12, 2] + num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights, self.num_points_list) + + output = self.output_proj(output) + + return output + + +class TransformerDecoderLayer(nn.Module): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation='relu', + n_levels=4, + n_points=4, + cross_attn_method='default', + value_shape='default', + ): + super(TransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, method=cross_attn_method, value_shape=value_shape) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = get_activation(activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self._reset_parameters() + + def _reset_parameters(self): + init.xavier_uniform_(self.linear1.weight) + init.xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + target, + reference_points, + memory, + memory_spatial_shapes, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(target, query_pos_embed) + + target2, _ = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = target + self.dropout1(target2) + target = self.norm1(target) + + # cross attention + target2 = self.cross_attn(\ + self.with_pos_embed(target, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_mask) + target = target + self.dropout2(target2) + target = self.norm2(target) + + # ffn + target2 = self.forward_ffn(target) + target = target + self.dropout4(target2) + target = self.norm3(target) + + return target + + +class TransformerDecoder(nn.Module): + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super(TransformerDecoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)]) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def forward(self, + target, + ref_points_unact, + memory, + memory_spatial_shapes, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None): + dec_out_bboxes = [] + dec_out_logits = [] + ref_points_detach = F.sigmoid(ref_points_unact) + + output = target + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach) + + output = layer(output, ref_points_input, memory, memory_spatial_shapes, attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + + 
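+            # Note: at inference only the layer at eval_idx emits predictions; earlier layers just
+            # refine the reference points, and the loop breaks right after eval_idx (see below).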
elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach() + + return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) + + +@register() +class RTDETRTransformerv2(nn.Module): + __share__ = ['num_classes', 'eval_spatial_size'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_points=4, + nhead=8, + num_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learn_query_content=False, + eval_spatial_size=None, + eval_idx=-1, + eps=1e-2, + aux_loss=True, + cross_attn_method='default', + query_select_method='default', + value_shape='reshape', + mlp_act='relu', + query_pos_method='default', + ): + super().__init__() + assert len(feat_channels) <= num_levels + assert len(feat_strides) == len(feat_channels) + + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_layers = num_layers + self.eval_spatial_size = eval_spatial_size + self.aux_loss = aux_loss + + assert query_select_method in ('default', 'one2many', 'agnostic'), '' + assert cross_attn_method in ('default', 'discrete'), '' + self.cross_attn_method = cross_attn_method + self.query_select_method = query_select_method + + # backbone feature projection + self._build_input_proj_layer(feat_channels) + + # Transformer module + decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, \ + activation, num_levels, num_points, cross_attn_method=cross_attn_method, value_shape=value_shape) + self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_layers, eval_idx) + + # denoising + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + if num_denoising > 0: + self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes) + init.normal_(self.denoising_class_embed.weight[:-1]) + + # decoder embedding + self.learn_query_content = learn_query_content + if learn_query_content: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + + if query_pos_method == 'as_reg': + self.query_pos_head = MLP(4, hidden_dim, hidden_dim, 3, act=mlp_act) + print(" ### Query Position Embedding@{} ### ".format(query_pos_method)) + else: + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2, act=mlp_act) + + # if num_select_queries != self.num_queries: + # layer = TransformerEncoderLayer(hidden_dim, nhead, dim_feedforward, activation='gelu') + # self.encoder = TransformerEncoder(layer, 1) + + self.enc_output = nn.Sequential(OrderedDict([ + ('proj', nn.Linear(hidden_dim, hidden_dim)), + ('norm', nn.LayerNorm(hidden_dim,)), + ])) + + if query_select_method == 'agnostic': + self.enc_score_head = nn.Linear(hidden_dim, 1) + else: + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, act=mlp_act) + + # decoder head + self.dec_score_head = nn.ModuleList([ + nn.Linear(hidden_dim, num_classes) for _ in range(num_layers) + ]) + self.dec_bbox_head = nn.ModuleList([ + MLP(hidden_dim, hidden_dim, 
4, 3, act=mlp_act) for _ in range(num_layers) + ]) + + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + anchors, valid_mask = self._generate_anchors() + self.register_buffer('anchors', anchors) + self.register_buffer('valid_mask', valid_mask) + + self._reset_parameters() + + def _reset_parameters(self): + bias = bias_init_with_prob(0.01) + init.constant_(self.enc_score_head.bias, bias) + init.constant_(self.enc_bbox_head.layers[-1].weight, 0) + init.constant_(self.enc_bbox_head.layers[-1].bias, 0) + + for _cls, _reg in zip(self.dec_score_head, self.dec_bbox_head): + init.constant_(_cls.bias, bias) + init.constant_(_reg.layers[-1].weight, 0) + init.constant_(_reg.layers[-1].bias, 0) + + init.xavier_uniform_(self.enc_output[0].weight) + if self.learn_query_content: + init.xavier_uniform_(self.tgt_embed.weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + for m in self.input_proj: + init.xavier_uniform_(m[0].weight) + + def _build_input_proj_layer(self, feat_channels): + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim,))]) + ) + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim))]) + ) + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats: List[torch.Tensor]): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + return feat_flatten, spatial_shapes + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype=torch.float32, + device='cpu'): + if spatial_shapes is None: + spatial_shapes = [] + eval_h, eval_w = self.eval_spatial_size + for s in self.feat_strides: + spatial_shapes.append([int(eval_h / s), int(eval_w / s)]) + + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], dim=-1) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4) + anchors.append(lvl_anchors) + + anchors = torch.concat(anchors, dim=1).to(device) + valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.inf) + + return anchors, valid_mask + + + def _get_decoder_input(self, + memory: torch.Tensor, + spatial_shapes, + denoising_logits=None, + 
denoising_bbox_unact=None): + + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors = self.anchors + valid_mask = self.valid_mask + + # memory = torch.where(valid_mask, memory, 0) + # TODO fix type error for onnx export + memory = valid_mask.to(memory.dtype) * memory + + output_memory :torch.Tensor = self.enc_output(memory) + enc_outputs_logits :torch.Tensor = self.enc_score_head(output_memory) + enc_outputs_coord_unact :torch.Tensor = self.enc_bbox_head(output_memory) + anchors + + enc_topk_bboxes_list, enc_topk_logits_list = [], [] + enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = \ + self._select_topk(output_memory, enc_outputs_logits, enc_outputs_coord_unact, self.num_queries) + + if self.training: + enc_topk_bboxes = F.sigmoid(enc_topk_bbox_unact) + enc_topk_bboxes_list.append(enc_topk_bboxes) + enc_topk_logits_list.append(enc_topk_logits) + + # if self.num_select_queries != self.num_queries: + # raise NotImplementedError('') + + if self.learn_query_content: + content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1]) + else: + content = enc_topk_memory.detach() + + enc_topk_bbox_unact = enc_topk_bbox_unact.detach() + + if denoising_bbox_unact is not None: + enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) + content = torch.concat([denoising_logits, content], dim=1) + + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + + def _select_topk(self, memory: torch.Tensor, outputs_logits: torch.Tensor, outputs_coords_unact: torch.Tensor, topk: int): + if self.query_select_method == 'default': + _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1) + + elif self.query_select_method == 'one2many': + _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1) + topk_ind = topk_ind // self.num_classes + + elif self.query_select_method == 'agnostic': + _, topk_ind = torch.topk(outputs_logits.squeeze(-1), topk, dim=-1) + + topk_ind: torch.Tensor + + topk_coords = outputs_coords_unact.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1])) + + topk_logits = outputs_logits.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1])) + + topk_memory = memory.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])) + + return topk_memory, topk_logits, topk_coords + + + def forward(self, feats, targets=None): + # input projection and embedding + memory, spatial_shapes = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, ) + else: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = \ + self._get_decoder_input(memory, spatial_shapes, denoising_logits, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits = self.decoder( + init_ref_contents, + init_ref_points_unact, + memory, + spatial_shapes, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask) + + 
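+        # When contrastive denoising is active, the decoder outputs stack the denoising queries
+        # in front of the regular queries; they are split back apart along the query axis (dim=2)
+        # below using the sizes recorded in dn_meta['dn_num_split'].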
if self.training and dn_meta is not None: + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + if self.training and self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out['enc_aux_outputs'] = self._set_aux_loss(enc_topk_logits_list, enc_topk_bboxes_list) + out['enc_meta'] = {'class_agnostic': self.query_select_method == 'agnostic'} + + if dn_meta is not None: + out['dn_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out['dn_meta'] = dn_meta + + return out + + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class, outputs_coord)] \ No newline at end of file diff --git a/engine/deim/utils.py b/deim/_engine/deim/utils.py similarity index 100% rename from engine/deim/utils.py rename to deim/_engine/deim/utils.py diff --git a/deim/_engine/deim_dataset_sides.yml b/deim/_engine/deim_dataset_sides.yml new file mode 100644 index 00000000..2b2cd49c --- /dev/null +++ b/deim/_engine/deim_dataset_sides.yml @@ -0,0 +1,41 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +num_classes: 2 # your dataset classes + 1 (background) For an init train add 1 class and remap false. +# for subsequent training correct num classes and remap true +remap_mscoco_category: True + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_sides/train/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_sides/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 8 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_sides/val/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_sides/annotations/instances_val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 8 + drop_last: False + collate_fn: + type: BatchImageCollateFunction diff --git a/deim/_engine/deim_dataset_under.yml b/deim/_engine/deim_dataset_under.yml new file mode 100644 index 00000000..c0dcd4cf --- /dev/null +++ b/deim/_engine/deim_dataset_under.yml @@ -0,0 +1,42 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +num_classes: 1 # your dataset classes + 1 (background) For an init train add 1 class and remap false. 
+# for subsequent training correct num classes and remap true +remap_mscoco_category: True + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_under/train/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_under/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 8 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /home/hidara/Documents/datasets/yolo_dataset_under/val/images + ann_file: /home/hidara/Documents/datasets/yolo_dataset_under/annotations/instances_val.json + + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 8 + drop_last: False + collate_fn: + type: BatchImageCollateFunction diff --git a/deim/_engine/engine/__init__.py b/deim/_engine/engine/__init__.py new file mode 100644 index 00000000..69baa01f --- /dev/null +++ b/deim/_engine/engine/__init__.py @@ -0,0 +1,16 @@ +""" +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" + +# for register purpose +from . import optim +from . import data +from . import deim + +from .backbone import * + +from .backbone import ( + get_activation, + FrozenBatchNorm2d, + freeze_batch_norm2d, +) \ No newline at end of file diff --git a/engine/backbone/__init__.py b/deim/_engine/engine/backbone/__init__.py similarity index 100% rename from engine/backbone/__init__.py rename to deim/_engine/engine/backbone/__init__.py diff --git a/deim/_engine/engine/backbone/common.py b/deim/_engine/engine/backbone/common.py new file mode 100644 index 00000000..bcbe0313 --- /dev/null +++ b/deim/_engine/engine/backbone/common.py @@ -0,0 +1,116 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn + + +class ConvNormLayer(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): + super().__init__() + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + padding=(kernel_size-1)//2 if padding is None else padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + return self.act(self.norm(self.conv(x))) + + +class FrozenBatchNorm2d(nn.Module): + """copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py + BatchNorm2d where the batch statistics and the affine parameters are fixed. + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. 
+ """ + def __init__(self, num_features, eps=1e-5): + super(FrozenBatchNorm2d, self).__init__() + n = num_features + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + self.eps = eps + self.num_features = n + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + scale = w * (rv + self.eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + def extra_repr(self): + return ( + "{num_features}, eps={eps}".format(**self.__dict__) + ) + +def freeze_batch_norm2d(module: nn.Module) -> nn.Module: + if isinstance(module, nn.BatchNorm2d): + module = FrozenBatchNorm2d(module.num_features) + else: + for name, child in module.named_children(): + _child = freeze_batch_norm2d(child) + if _child is not child: + setattr(module, name, _child) + return module + + +def get_activation(act: str, inplace: bool=True): + """get activation + """ + if act is None: + return nn.Identity() + + elif isinstance(act, nn.Module): + return act + + act = act.lower() + + if act == 'silu' or act == 'swish': + m = nn.SiLU() + + elif act == 'relu': + m = nn.ReLU() + + elif act == 'leaky_relu': + m = nn.LeakyReLU() + + elif act == 'silu': + m = nn.SiLU() + + elif act == 'gelu': + m = nn.GELU() + + elif act == 'hardsigmoid': + m = nn.Hardsigmoid() + + else: + raise RuntimeError('') + + if hasattr(m, 'inplace'): + m.inplace = inplace + + return m diff --git a/engine/backbone/csp_darknet.py b/deim/_engine/engine/backbone/csp_darknet.py similarity index 100% rename from engine/backbone/csp_darknet.py rename to deim/_engine/engine/backbone/csp_darknet.py diff --git a/deim/_engine/engine/backbone/csp_resnet.py b/deim/_engine/engine/backbone/csp_resnet.py new file mode 100644 index 00000000..2d22b28c --- /dev/null +++ b/deim/_engine/engine/backbone/csp_resnet.py @@ -0,0 +1,277 @@ +""" +https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/ppdet/modeling/backbones/cspresnet.py + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from collections import OrderedDict + +from .common import get_activation + +from ..core import register + +__all__ = ['CSPResNet'] + + +donwload_url = { + 's': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_s_pretrained_from_paddle.pth', + 'm': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_m_pretrained_from_paddle.pth', + 'l': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_l_pretrained_from_paddle.pth', + 'x': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_x_pretrained_from_paddle.pth', +} + + +class ConvBNLayer(nn.Module): + def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None): + super().__init__() + self.conv = nn.Conv2d(ch_in, ch_out, filter_size, stride, padding, groups=groups, bias=False) + self.bn = nn.BatchNorm2d(ch_out) + self.act = get_activation(act) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + +class RepVggBlock(nn.Module): + def __init__(self, ch_in, ch_out, act='relu', alpha: bool=False): + super().__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.conv2 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + self.act = get_activation(act) + + if alpha: + self.alpha = nn.Parameter(torch.ones(1, )) + else: + self.alpha = None + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + if self.alpha: + y = self.conv1(x) + self.alpha * self.conv2(x) + else: + y = self.conv1(x) + self.conv2(x) + y = self.act(y) + return y + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1) + + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.data = kernel + self.conv.bias.data = bias + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + self.alpha * bias1x1 + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return F.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: ConvBNLayer): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.norm.running_mean + running_var = branch.norm.running_var + gamma = branch.norm.weight + beta = branch.norm.bias + eps = branch.norm.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Module): + def __init__(self, + ch_in, + ch_out, + act='relu', + shortcut=True, + use_alpha=False): + super().__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class EffectiveSELayer(nn.Module): + """ Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - 
https://arxiv.org/abs/1911.06667 + """ + + def __init__(self, channels, act='hardsigmoid'): + super(EffectiveSELayer, self).__init__() + self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) + self.act = get_activation(act) + + def forward(self, x: torch.Tensor): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + x_se = self.act(x_se) + return x * x_se + + +class CSPResStage(nn.Module): + def __init__(self, + block_fn, + ch_in, + ch_out, + n, + stride, + act='relu', + attn='eca', + use_alpha=False): + super().__init__() + ch_mid = (ch_in + ch_out) // 2 + if stride == 2: + self.conv_down = ConvBNLayer( + ch_in, ch_mid, 3, stride=2, padding=1, act=act) + else: + self.conv_down = None + self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.blocks = nn.Sequential(*[ + block_fn( + ch_mid // 2, + ch_mid // 2, + act=act, + shortcut=True, + use_alpha=use_alpha) for i in range(n) + ]) + if attn: + self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') + else: + self.attn = None + + self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = torch.concat([y1, y2], dim=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@register() +class CSPResNet(nn.Module): + layers = [3, 6, 6, 3] + channels = [64, 128, 256, 512, 1024] + model_cfg = { + 's': {'depth_mult': 0.33, 'width_mult': 0.50, }, + 'm': {'depth_mult': 0.67, 'width_mult': 0.75, }, + 'l': {'depth_mult': 1.00, 'width_mult': 1.00, }, + 'x': {'depth_mult': 1.33, 'width_mult': 1.25, }, + } + + def __init__(self, + name: str, + act='silu', + return_idx=[1, 2, 3], + use_large_stem=True, + use_alpha=False, + pretrained=False): + + super().__init__() + depth_mult = self.model_cfg[name]['depth_mult'] + width_mult = self.model_cfg[name]['width_mult'] + + channels = [max(round(c * width_mult), 1) for c in self.channels] + layers = [max(round(l * depth_mult), 1) for l in self.layers] + act = get_activation(act) + + if use_large_stem: + self.stem = nn.Sequential(OrderedDict([ + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act)), ('conv3', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))])) + else: + self.stem = nn.Sequential(OrderedDict([ + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))])) + + n = len(channels) - 1 + self.stages = nn.Sequential(OrderedDict([(str(i), CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha)) for i in range(n)])) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + + if pretrained: + if isinstance(pretrained, bool) or 'http' in pretrained: + state = torch.hub.load_state_dict_from_url(donwload_url[name], map_location='cpu') + else: + state = torch.load(pretrained, map_location='cpu') + self.load_state_dict(state) + print(f'Load CSPResNet_{name} state_dict') + + def forward(self, x): + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs 
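For orientation, a minimal sketch of driving this backbone on its own (not part of the diff): the import path assumes the refactored deim/_engine/engine package is importable from the project root, and the 'l' variant with a 640x640 input is an arbitrary illustrative choice; the printed shapes correspond to the 'l' width multiplier.

    import torch
    from deim._engine.engine.backbone.csp_resnet import CSPResNet

    backbone = CSPResNet(name='l', return_idx=[1, 2, 3], pretrained=False).eval()
    dummy = torch.randn(1, 3, 640, 640)         # fake RGB batch
    feats = backbone(dummy)                     # stride-8/16/32 feature maps
    print([tuple(f.shape) for f in feats])      # (1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)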
diff --git a/engine/backbone/hgnetv2.py b/deim/_engine/engine/backbone/hgnetv2.py similarity index 100% rename from engine/backbone/hgnetv2.py rename to deim/_engine/engine/backbone/hgnetv2.py diff --git a/engine/backbone/presnet.py b/deim/_engine/engine/backbone/presnet.py similarity index 100% rename from engine/backbone/presnet.py rename to deim/_engine/engine/backbone/presnet.py diff --git a/engine/backbone/test_resnet.py b/deim/_engine/engine/backbone/test_resnet.py similarity index 100% rename from engine/backbone/test_resnet.py rename to deim/_engine/engine/backbone/test_resnet.py diff --git a/engine/backbone/timm_model.py b/deim/_engine/engine/backbone/timm_model.py similarity index 100% rename from engine/backbone/timm_model.py rename to deim/_engine/engine/backbone/timm_model.py diff --git a/engine/backbone/torchvision_model.py b/deim/_engine/engine/backbone/torchvision_model.py similarity index 100% rename from engine/backbone/torchvision_model.py rename to deim/_engine/engine/backbone/torchvision_model.py diff --git a/deim/_engine/engine/backbone/utils.py b/deim/_engine/engine/backbone/utils.py new file mode 100644 index 00000000..7c4e0901 --- /dev/null +++ b/deim/_engine/engine/backbone/utils.py @@ -0,0 +1,54 @@ +""" +https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +from collections import OrderedDict +from typing import Dict, List + + +import torch.nn as nn + + +class IntermediateLayerGetter(nn.ModuleDict): + """ + Module wrapper that returns intermediate layers from a model + + It has a strong assumption that the modules have been registered + into the model in the same order as they are used. + This means that one should **not** reuse the same nn.Module + twice in the forward if you want this to work. + + Additionally, it is only able to query submodules that are directly + assigned to the model. So if `model` is passed, `model.feature1` can + be returned, but not `model.feature1.layer2`. + """ + + _version = 3 + + def __init__(self, model: nn.Module, return_layers: List[str]) -> None: + if not set(return_layers).issubset([name for name, _ in model.named_children()]): + raise ValueError("return_layers are not present in model. {}"\ + .format([name for name, _ in model.named_children()])) + orig_return_layers = return_layers + return_layers = {str(k): str(k) for k in return_layers} + layers = OrderedDict() + for name, module in model.named_children(): + layers[name] = module + if name in return_layers: + del return_layers[name] + if not return_layers: + break + + super().__init__(layers) + self.return_layers = orig_return_layers + + def forward(self, x): + outputs = [] + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + outputs.append(x) + + return outputs diff --git a/engine/core/__init__.py b/deim/_engine/engine/core/__init__.py similarity index 100% rename from engine/core/__init__.py rename to deim/_engine/engine/core/__init__.py diff --git a/engine/core/_config.py b/deim/_engine/engine/core/_config.py similarity index 100% rename from engine/core/_config.py rename to deim/_engine/engine/core/_config.py diff --git a/deim/_engine/engine/core/workspace.py b/deim/_engine/engine/core/workspace.py new file mode 100644 index 00000000..d3e87a08 --- /dev/null +++ b/deim/_engine/engine/core/workspace.py @@ -0,0 +1,180 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
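+
+This module implements the global registry behind the YAML config system:
+register() records a class schema (constructor defaults plus the optional
+__share__/__inject__ hooks) in GLOBAL_CONFIG, and create() instantiates the
+registered class with config overrides applied.
+
+Minimal usage sketch (names are illustrative only; import register, create and
+GLOBAL_CONFIG from this package):
+
+    import torch.nn as nn
+
+    @register()
+    class TinyHead(nn.Module):
+        def __init__(self, hidden_dim=256):
+            super().__init__()
+            self.proj = nn.Linear(hidden_dim, hidden_dim)
+
+    GLOBAL_CONFIG['TinyHead']['hidden_dim'] = 128   # override the default
+    head = create('TinyHead')                       # -> TinyHead(hidden_dim=128)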
+""" + +import inspect +import importlib +import functools +from collections import defaultdict +from typing import Any, Dict, Optional, List + + +GLOBAL_CONFIG = defaultdict(dict) + + +def register(dct :Any=GLOBAL_CONFIG, name=None, force=False): + """ + dct: + if dct is Dict, register foo into dct as key-value pair + if dct is Clas, register as modules attibute + force + whether force register. + """ + def decorator(foo): + register_name = foo.__name__ if name is None else name + if not force: + if inspect.isclass(dct): + assert not hasattr(dct, foo.__name__), \ + f'module {dct.__name__} has {foo.__name__}' + else: + assert foo.__name__ not in dct, \ + f'{foo.__name__} has been already registered' + + if inspect.isfunction(foo): + @functools.wraps(foo) + def wrap_func(*args, **kwargs): + return foo(*args, **kwargs) + if isinstance(dct, dict): + dct[foo.__name__] = wrap_func + elif inspect.isclass(dct): + setattr(dct, foo.__name__, wrap_func) + else: + raise AttributeError('') + return wrap_func + + elif inspect.isclass(foo): + dct[register_name] = extract_schema(foo) + + else: + raise ValueError(f'Do not support {type(foo)} register') + + return foo + + return decorator + + + +def extract_schema(module: type): + """ + Args: + module (type), + Return: + Dict, + """ + argspec = inspect.getfullargspec(module.__init__) + arg_names = [arg for arg in argspec.args if arg != 'self'] + num_defualts = len(argspec.defaults) if argspec.defaults is not None else 0 + num_requires = len(arg_names) - num_defualts + + schame = dict() + schame['_name'] = module.__name__ + schame['_pymodule'] = importlib.import_module(module.__module__) + schame['_inject'] = getattr(module, '__inject__', []) + schame['_share'] = getattr(module, '__share__', []) + schame['_kwargs'] = {} + for i, name in enumerate(arg_names): + if name in schame['_share']: + assert i >= num_requires, 'share config must have default value.' + value = argspec.defaults[i - num_requires] + + elif i >= num_requires: + value = argspec.defaults[i - num_requires] + + else: + value = None + + schame[name] = value + schame['_kwargs'][name] = value + + return schame + + +def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs): + """ + """ + assert type(type_or_name) in (type, str), 'create should be modules or name.' 
+ + name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__ + + if name in global_cfg: + if hasattr(global_cfg[name], '__dict__'): + return global_cfg[name] + else: + raise ValueError('The module {} is not registered'.format(name)) + + cfg = global_cfg[name] + + if isinstance(cfg, dict) and 'type' in cfg: + _cfg: dict = global_cfg[cfg['type']] + # clean args + _keys = [k for k in _cfg.keys() if not k.startswith('_')] + for _arg in _keys: + del _cfg[_arg] + _cfg.update(_cfg['_kwargs']) # restore default args + _cfg.update(cfg) # load config args + _cfg.update(kwargs) # TODO recive extra kwargs + name = _cfg.pop('type') # pop extra key `type` (from cfg) + + return create(name, global_cfg) + + module = getattr(cfg['_pymodule'], name) + module_kwargs = {} + module_kwargs.update(cfg) + + # shared var + for k in cfg['_share']: + if k in global_cfg: + module_kwargs[k] = global_cfg[k] + else: + module_kwargs[k] = cfg[k] + + # inject + for k in cfg['_inject']: + _k = cfg[k] + + if _k is None: + continue + + if isinstance(_k, str): + if _k not in global_cfg: + raise ValueError(f'Missing inject config of {_k}.') + + _cfg = global_cfg[_k] + + if isinstance(_cfg, dict): + module_kwargs[k] = create(_cfg['_name'], global_cfg) + else: + module_kwargs[k] = _cfg + + elif isinstance(_k, dict): + if 'type' not in _k.keys(): + raise ValueError('Missing inject for `type` style.') + + _type = str(_k['type']) + if _type not in global_cfg: + raise ValueError(f'Missing {_type} in inspect stage.') + + # TODO + _cfg: dict = global_cfg[_type] + # clean args + _keys = [k for k in _cfg.keys() if not k.startswith('_')] + for _arg in _keys: + del _cfg[_arg] + _cfg.update(_cfg['_kwargs']) # restore default values + _cfg.update(_k) # load config args + name = _cfg.pop('type') # pop extra key (`type` from _k) + module_kwargs[k] = create(name, global_cfg) + + else: + raise ValueError(f'Inject does not support {_k}') + + # TODO hard code + module_kwargs = {k: v for k, v in module_kwargs.items() if not k.startswith('_')} + + # TODO for **kwargs + # extra_args = set(module_kwargs.keys()) - set(arg_names) + # if len(extra_args) > 0: + # raise RuntimeError(f'Error: unknown args {extra_args} for {module}') + + return module(**module_kwargs) diff --git a/engine/core/yaml_config.py b/deim/_engine/engine/core/yaml_config.py similarity index 97% rename from engine/core/yaml_config.py rename to deim/_engine/engine/core/yaml_config.py index bdd27b41..236f2995 100644 --- a/engine/core/yaml_config.py +++ b/deim/_engine/engine/core/yaml_config.py @@ -7,9 +7,10 @@ import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader - +from datetime import datetime as dt import re import copy +from pathlib import Path from ._config import BaseConfig from .workspace import create @@ -21,7 +22,8 @@ def __init__(self, cfg_path: str, **kwargs) -> None: cfg = load_config(cfg_path) cfg = merge_dict(cfg, kwargs) - + ts = dt.now().strftime("%Y%m%d_%H%M%S") + cfg['output_dir'] = f"{cfg['output_dir']}/{ts}" self.yaml_cfg = copy.deepcopy(cfg) for k in super().__dict__: diff --git a/engine/core/yaml_utils.py b/deim/_engine/engine/core/yaml_utils.py similarity index 100% rename from engine/core/yaml_utils.py rename to deim/_engine/engine/core/yaml_utils.py diff --git a/engine/data/__init__.py b/deim/_engine/engine/data/__init__.py similarity index 100% rename from engine/data/__init__.py rename to deim/_engine/engine/data/__init__.py diff --git a/deim/_engine/engine/data/_misc.py 
b/deim/_engine/engine/data/_misc.py new file mode 100644 index 00000000..22c333fe --- /dev/null +++ b/deim/_engine/engine/data/_misc.py @@ -0,0 +1,56 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import importlib.metadata +from torch import Tensor + +if '0.15.2' in importlib.metadata.version('torchvision'): + import torchvision + torchvision.disable_beta_transforms_warning() + + from torchvision.datapoints import BoundingBox as BoundingBoxes + from torchvision.datapoints import BoundingBoxFormat, Mask, Image, Video + from torchvision.transforms.v2 import SanitizeBoundingBox as SanitizeBoundingBoxes + _boxes_keys = ['format', 'spatial_size'] + +elif '0.17' > importlib.metadata.version('torchvision') >= '0.16': + import torchvision + torchvision.disable_beta_transforms_warning() + + from torchvision.transforms.v2 import SanitizeBoundingBoxes + from torchvision.tv_tensors import ( + BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) + _boxes_keys = ['format', 'canvas_size'] + +elif importlib.metadata.version('torchvision') >= '0.17': + import torchvision + from torchvision.transforms.v2 import SanitizeBoundingBoxes + from torchvision.tv_tensors import ( + BoundingBoxes, BoundingBoxFormat, Mask, Image, Video) + _boxes_keys = ['format', 'canvas_size'] + +else: + raise RuntimeError('Please make sure torchvision version >= 0.15.2') + + + +def convert_to_tv_tensor(tensor: Tensor, key: str, box_format='xyxy', spatial_size=None) -> Tensor: + """ + Args: + tensor (Tensor): input tensor + key (str): transform to key + + Return: + Dict[str, TV_Tensor] + """ + assert key in ('boxes', 'masks', ), "Only support 'boxes' and 'masks'" + + if key == 'boxes': + box_format = getattr(BoundingBoxFormat, box_format.upper()) + _kwargs = dict(zip(_boxes_keys, [box_format, spatial_size])) + return BoundingBoxes(tensor, **_kwargs) + + if key == 'masks': + return Mask(tensor) diff --git a/engine/data/dataloader.py b/deim/_engine/engine/data/dataloader.py similarity index 100% rename from engine/data/dataloader.py rename to deim/_engine/engine/data/dataloader.py diff --git a/engine/data/dataset/__init__.py b/deim/_engine/engine/data/dataset/__init__.py similarity index 100% rename from engine/data/dataset/__init__.py rename to deim/_engine/engine/data/dataset/__init__.py diff --git a/deim/_engine/engine/data/dataset/_dataset.py b/deim/_engine/engine/data/dataset/_dataset.py new file mode 100644 index 00000000..66784a86 --- /dev/null +++ b/deim/_engine/engine/data/dataset/_dataset.py @@ -0,0 +1,24 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
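+
+DetDataset is the shared base class for detection datasets: subclasses
+implement load_item(index) to return the raw (image, target) pair, the
+configured transforms are applied on top of it, and set_epoch()/epoch expose
+the current training epoch to epoch-aware transform policies.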
+""" + +import torch +import torch.utils.data as data + +class DetDataset(data.Dataset): + def __getitem__(self, index): + img, target = self.load_item(index) + if self.transforms is not None: + img, target, _ = self.transforms(img, target, self) + return img, target + + def load_item(self, index): + raise NotImplementedError("Please implement this function to return item before `transforms`.") + + def set_epoch(self, epoch) -> None: + self._epoch = epoch + + @property + def epoch(self): + return self._epoch if hasattr(self, '_epoch') else -1 diff --git a/engine/data/dataset/coco_dataset.py b/deim/_engine/engine/data/dataset/coco_dataset.py similarity index 100% rename from engine/data/dataset/coco_dataset.py rename to deim/_engine/engine/data/dataset/coco_dataset.py diff --git a/engine/data/dataset/coco_eval.py b/deim/_engine/engine/data/dataset/coco_eval.py similarity index 100% rename from engine/data/dataset/coco_eval.py rename to deim/_engine/engine/data/dataset/coco_eval.py diff --git a/deim/_engine/engine/data/dataset/coco_utils.py b/deim/_engine/engine/data/dataset/coco_utils.py new file mode 100644 index 00000000..e8f38d44 --- /dev/null +++ b/deim/_engine/engine/data/dataset/coco_utils.py @@ -0,0 +1,192 @@ +""" +copy and modified https://github.com/pytorch/vision/blob/main/references/detection/coco_utils.py + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +import torch +import torch.utils.data +import torchvision +import torchvision.transforms.functional as TVF +import faster_coco_eval.core.mask as coco_mask +from faster_coco_eval import COCO + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask: + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + + anno = target["annotations"] + + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] 
for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different criteria for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds): + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {"images": [], "categories": [], "annotations": []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + # img, targets = ds[img_idx] + + img, targets = ds.load_item(img_idx) + width, height = img.size + + image_id = targets["image_id"].item() + img_dict = {} + img_dict["id"] = image_id + img_dict["width"] = width + img_dict["height"] = height + dataset["images"].append(img_dict) + bboxes = targets["boxes"].clone() + bboxes[:, 2:] -= bboxes[:, :2] # xyxy -> xywh + bboxes = bboxes.tolist() + labels = targets["labels"].tolist() + areas = targets["area"].tolist() + iscrowd = targets["iscrowd"].tolist() + if "masks" in targets: + masks = targets["masks"] + # make masks Fortran contiguous for coco_mask + masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) + if "keypoints" in targets: + keypoints = targets["keypoints"] + keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() + num_objs = len(bboxes) + for i in range(num_objs): + ann = {} + ann["image_id"] = image_id + ann["bbox"] = bboxes[i] + ann["category_id"] = labels[i] + categories.add(labels[i]) + ann["area"] = areas[i] + ann["iscrowd"] = iscrowd[i] + ann["id"] = ann_id + if "masks" in targets: + ann["segmentation"] = coco_mask.encode(masks[i].numpy()) + if "keypoints" in targets: + ann["keypoints"] = keypoints[i] + ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3]) + dataset["annotations"].append(ann) + ann_id += 1 + dataset["categories"] = [{"id": i} for i in sorted(categories)] + coco_ds.dataset = dataset + coco_ds.createIndex() + return coco_ds + + +def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? 
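+    # Unwrap up to 10 levels of torch.utils.data.Subset to reach the underlying
+    # torchvision CocoDetection dataset; if one is found its COCO index is
+    # reused, otherwise a COCO object is rebuilt via convert_to_coco_api().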
+ for _ in range(10): + if isinstance(dataset, torchvision.datasets.CocoDetection): + break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + return convert_to_coco_api(dataset) diff --git a/engine/data/dataset/voc_detection.py b/deim/_engine/engine/data/dataset/voc_detection.py similarity index 100% rename from engine/data/dataset/voc_detection.py rename to deim/_engine/engine/data/dataset/voc_detection.py diff --git a/deim/_engine/engine/data/dataset/voc_eval.py b/deim/_engine/engine/data/dataset/voc_eval.py new file mode 100644 index 00000000..0bee50ae --- /dev/null +++ b/deim/_engine/engine/data/dataset/voc_eval.py @@ -0,0 +1,12 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torchvision + + +class VOCEvaluator(object): + def __init__(self) -> None: + pass diff --git a/engine/data/transforms/__init__.py b/deim/_engine/engine/data/transforms/__init__.py similarity index 100% rename from engine/data/transforms/__init__.py rename to deim/_engine/engine/data/transforms/__init__.py diff --git a/deim/_engine/engine/data/transforms/_transforms.py b/deim/_engine/engine/data/transforms/_transforms.py new file mode 100644 index 00000000..2b1dd692 --- /dev/null +++ b/deim/_engine/engine/data/transforms/_transforms.py @@ -0,0 +1,184 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn + +import torchvision +import torchvision.transforms.v2 as T +import torchvision.transforms.v2.functional as F + +import PIL +import PIL.Image + +from typing import Any, Dict, List, Optional + +from .._misc import convert_to_tv_tensor, _boxes_keys +from .._misc import Image, Video, Mask, BoundingBoxes +from .._misc import SanitizeBoundingBoxes + +from ...core import register +torchvision.disable_beta_transforms_warning() + + +RandomPhotometricDistort = register()(T.RandomPhotometricDistort) +RandomZoomOut = register()(T.RandomZoomOut) +RandomHorizontalFlip = register()(T.RandomHorizontalFlip) +Resize = register()(T.Resize) +# ToImageTensor = register()(T.ToImageTensor) +# ConvertDtype = register()(T.ConvertDtype) +# PILToTensor = register()(T.PILToTensor) +SanitizeBoundingBoxes = register(name='SanitizeBoundingBoxes')(SanitizeBoundingBoxes) +RandomCrop = register()(T.RandomCrop) +Normalize = register()(T.Normalize) + +# Thermal-specific transforms for mining environment +# Note: GaussianBlur and RandomRotation need wrapping to add 'p' parameter support +# RandomPerspective and RandomAdjustSharpness already support 'p' natively + +@register() +class GaussianBlur(T.GaussianBlur): + def __init__(self, kernel_size, sigma=(0.1, 2.0), p: float = 1.0): + super().__init__(kernel_size, sigma) + self.p = p + + def __call__(self, *inputs: Any) -> Any: + if torch.rand(1) >= self.p: + return inputs if len(inputs) > 1 else inputs[0] + return super().forward(*inputs) + + +@register() +class RandomRotation(T.RandomRotation): + def __init__(self, degrees, interpolation=T.InterpolationMode.NEAREST, expand=False, center=None, fill=0, p: float = 1.0): + super().__init__(degrees, interpolation, expand, center, fill) + self.p = p + + def __call__(self, *inputs: Any) -> Any: + if torch.rand(1) >= self.p: + return inputs if len(inputs) > 1 else inputs[0] + return super().forward(*inputs) + + +@register() +class 
RandomPerspective(T.RandomPerspective): + def __init__(self, distortion_scale: float = 0.5, p: float = 0.5, interpolation=T.InterpolationMode.BILINEAR, fill=0): + # Note: RandomPerspective already has 'p' but we override to ensure consistency + super().__init__(distortion_scale, p, interpolation, fill) + + +@register() +class RandomAdjustSharpness(T.RandomAdjustSharpness): + def __init__(self, sharpness_factor: float, p: float = 0.5): + # Note: RandomAdjustSharpness already has 'p' parameter + super().__init__(sharpness_factor, p) + + +@register() +class EmptyTransform(T.Transform): + def __init__(self, ) -> None: + super().__init__() + + def forward(self, *inputs): + inputs = inputs if len(inputs) > 1 else inputs[0] + return inputs + + +@register() +class PadToSize(T.Pad): + _transformed_types = ( + PIL.Image.Image, + Image, + Video, + Mask, + BoundingBoxes, + ) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + sp = F.get_spatial_size(flat_inputs[0]) + h, w = self.size[1] - sp[0], self.size[0] - sp[1] + self.padding = [0, 0, w, h] + return dict(padding=self.padding) + + def __init__(self, size, fill=0, padding_mode='constant') -> None: + if isinstance(size, int): + size = (size, size) + self.size = size + super().__init__(0, fill, padding_mode) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + fill = self._fill[type(inpt)] + padding = params['padding'] + return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type] + + def __call__(self, *inputs: Any) -> Any: + outputs = super().forward(*inputs) + if len(outputs) > 1 and isinstance(outputs[1], dict): + outputs[1]['padding'] = torch.tensor(self.padding) + return outputs + + +@register() +class RandomIoUCrop(T.RandomIoUCrop): + def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0): + super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials) + self.p = p + + def __call__(self, *inputs: Any) -> Any: + if torch.rand(1) >= self.p: + return inputs if len(inputs) > 1 else inputs[0] + + return super().forward(*inputs) + + +@register() +class ConvertBoxes(T.Transform): + _transformed_types = ( + BoundingBoxes, + ) + def __init__(self, fmt='', normalize=False) -> None: + super().__init__() + self.fmt = fmt + self.normalize = normalize + + def transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return self._transform(inpt, params) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + spatial_size = getattr(inpt, _boxes_keys[1]) + if self.fmt: + in_fmt = inpt.format.value.lower() + inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.fmt.lower()) + inpt = convert_to_tv_tensor(inpt, key='boxes', box_format=self.fmt.upper(), spatial_size=spatial_size) + + if self.normalize: + inpt = inpt / torch.tensor(spatial_size[::-1]).tile(2)[None] + + return inpt + + +@register() +class ConvertPILImage(T.Transform): + _transformed_types = ( + PIL.Image.Image, + ) + def __init__(self, dtype='float32', scale=True) -> None: + super().__init__() + self.dtype = dtype + self.scale = scale + + def transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return self._transform(inpt, params) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + inpt = F.pil_to_tensor(inpt) + if self.dtype == 'float32': + inpt = inpt.float() + + if self.scale: + 
inpt = inpt / 255. + + inpt = Image(inpt) + + return inpt diff --git a/engine/data/transforms/container.py b/deim/_engine/engine/data/transforms/container.py similarity index 100% rename from engine/data/transforms/container.py rename to deim/_engine/engine/data/transforms/container.py diff --git a/deim/_engine/engine/data/transforms/functional.py b/deim/_engine/engine/data/transforms/functional.py new file mode 100644 index 00000000..140d39d0 --- /dev/null +++ b/deim/_engine/engine/data/transforms/functional.py @@ -0,0 +1,169 @@ +import torch +import torchvision.transforms.functional as F + +from packaging import version +from typing import Optional, List +from torch import Tensor + +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision +if version.parse(torchvision.__version__) < version.parse('0.7'): + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. + """ + if version.parse(torchvision.__version__) < version.parse('0.7'): + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) + + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
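+        # slice the masks with the same (i, j, h, w) window used for the image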
+ target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target['boxes'].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target['masks'].flatten(1).any(1) + + for field in fields: + target[field] = target[field][keep] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + # r = min(size / min(h, w), max_size / max(h, w)) + # ow = int(w * r) + # oh = int(h * r) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
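+    # PIL's Image.size is (width, height); reversing it stores target["size"]
+    # as (height, width), consistent with crop() and resize() above.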
+ target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target diff --git a/engine/data/transforms/mosaic.py b/deim/_engine/engine/data/transforms/mosaic.py similarity index 100% rename from engine/data/transforms/mosaic.py rename to deim/_engine/engine/data/transforms/mosaic.py diff --git a/deim/_engine/engine/deim/__init__.py b/deim/_engine/engine/deim/__init__.py new file mode 100644 index 00000000..acd3dc3c --- /dev/null +++ b/deim/_engine/engine/deim/__init__.py @@ -0,0 +1,18 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +from .deim import DEIM + +from .matcher import HungarianMatcher +from .hybrid_encoder import HybridEncoder +from .dfine_decoder import DFINETransformer +from .rtdetrv2_decoder import RTDETRTransformerv2 + +from .postprocessor import PostProcessor +from .deim_criterion import DEIMCriterion \ No newline at end of file diff --git a/deim/_engine/engine/deim/box_ops.py b/deim/_engine/engine/deim/box_ops.py new file mode 100644 index 00000000..ede1b324 --- /dev/null +++ b/deim/_engine/engine/deim/box_ops.py @@ -0,0 +1,90 @@ +""" +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +https://github.com/facebookresearch/detr/blob/main/util/box_ops.py +""" + +import torch +from torch import Tensor +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w.clamp(min=0.0)), (y_c - 0.5 * h.clamp(min=0.0)), + (x_c + 0.5 * w.clamp(min=0.0)), (y_c + 0.5 * h.clamp(min=0.0))] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x: Tensor) -> Tensor: + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1: Tensor, boxes2: Tensor): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + The boxes should be in [x0, y0, x1, y1] format + + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
+ + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) \ No newline at end of file diff --git a/deim/_engine/engine/deim/deim.py b/deim/_engine/engine/deim/deim.py new file mode 100644 index 00000000..7170fb5a --- /dev/null +++ b/deim/_engine/engine/deim/deim.py @@ -0,0 +1,38 @@ +""" +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" + +import torch.nn as nn +from ..core import register + + +__all__ = ['DEIM', ] + + +@register() +class DEIM(nn.Module): + __inject__ = ['backbone', 'encoder', 'decoder', ] + + def __init__(self, \ + backbone: nn.Module, + encoder: nn.Module, + decoder: nn.Module, + ): + super().__init__() + self.backbone = backbone + self.decoder = decoder + self.encoder = encoder + + def forward(self, x, targets=None): + x = self.backbone(x) + x = self.encoder(x) + x = self.decoder(x, targets) + + return x + + def deploy(self, ): + self.eval() + for m in self.modules(): + if hasattr(m, 'convert_to_deploy'): + m.convert_to_deploy() + return self diff --git a/deim/_engine/engine/deim/deim_criterion.py b/deim/_engine/engine/deim/deim_criterion.py new file mode 100644 index 00000000..814364fe --- /dev/null +++ b/deim/_engine/engine/deim/deim_criterion.py @@ -0,0 +1,489 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE/) +Copyright (c) 2024 D-FINE Authors. All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.distributed +import torch.nn.functional as F +import torchvision + +import copy + +from .dfine_utils import bbox2distance +from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou +from ..misc.dist_utils import get_world_size, is_dist_available_and_initialized +from ..core import register + + +@register() +class DEIMCriterion(nn.Module): + """ This class computes the loss for DEIM. + """ + __share__ = ['num_classes', ] + __inject__ = ['matcher', ] + + def __init__(self, \ + matcher, + weight_dict, + losses, + alpha=0.2, + gamma=2.0, + num_classes=80, + reg_max=32, + boxes_weight_format=None, + share_matched_indices=False, + mal_alpha=None, + use_uni_set=True, + ): + """Create the criterion. + Parameters: + matcher: module able to compute a matching between targets and proposals. + weight_dict: dict containing as key the names of the losses and as values their relative weight. + losses: list of all the losses to be applied. See get_loss for list of available losses. + num_classes: number of object categories, omitting the special no-object category. + reg_max (int): Max number of the discrete bins in D-FINE. + boxes_weight_format: format for boxes weight (iou, ). 
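+            alpha, gamma: focal-style modulation factors shared by the focal/VFL/MAL classification losses.
+            mal_alpha: alpha weighting used by the MAL loss; None disables the alpha term.
+            use_uni_set: if True, the box and local losses use the union ('go') set of matches gathered across decoder/encoder layers.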
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.losses = losses + self.boxes_weight_format = boxes_weight_format + self.share_matched_indices = share_matched_indices + self.alpha = alpha + self.gamma = gamma + self.fgl_targets, self.fgl_targets_dn = None, None + self.own_targets, self.own_targets_dn = None, None + self.reg_max = reg_max + self.num_pos, self.num_neg = None, None + self.mal_alpha = mal_alpha + self.use_uni_set = use_uni_set + + def loss_labels_focal(self, outputs, targets, indices, num_boxes): + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1] + loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + + return {'loss_focal': loss} + + def loss_labels_vfl(self, outputs, targets, indices, num_boxes, values=None): + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + if values is None: + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)) + ious = torch.diag(ious).detach() + else: + ious = values + + src_logits = outputs['pred_logits'] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = target_score_o.unsqueeze(-1) * target + + pred_score = F.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {'loss_vfl': loss} + + def loss_labels_mal(self, outputs, targets, indices, num_boxes, values=None): + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + if values is None: + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)) + ious = torch.diag(ious).detach() + else: + ious = values + + src_logits = outputs['pred_logits'] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = 
target_score_o.unsqueeze(-1) * target + + pred_score = F.sigmoid(src_logits).detach() + target_score = target_score.pow(self.gamma) + if self.mal_alpha != None: + weight = self.mal_alpha * pred_score.pow(self.gamma) * (1 - target) + target + else: + weight = pred_score.pow(self.gamma) * (1 - target) + target + + # print(" ### DEIM-gamma{}-alpha{} ### ".format(self.gamma, self.mal_alpha)) + loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {'loss_mal': loss} + + def loss_boxes(self, outputs, targets, indices, num_boxes, boxes_weight=None): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + losses = {} + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(generalized_box_iou(\ + box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))) + loss_giou = loss_giou if boxes_weight is None else loss_giou * boxes_weight + losses['loss_giou'] = loss_giou.sum() / num_boxes + + return losses + + def loss_local(self, outputs, targets, indices, num_boxes, T=5): + """Compute Fine-Grained Localization (FGL) Loss + and Decoupled Distillation Focal (DDF) Loss. """ + + losses = {} + if 'pred_corners' in outputs: + idx = self._get_src_permutation_idx(indices) + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + pred_corners = outputs['pred_corners'][idx].reshape(-1, (self.reg_max+1)) + ref_points = outputs['ref_points'][idx].detach() + with torch.no_grad(): + if self.fgl_targets_dn is None and 'is_dn' in outputs: + self.fgl_targets_dn= bbox2distance(ref_points, box_cxcywh_to_xyxy(target_boxes), + self.reg_max, outputs['reg_scale'], outputs['up']) + if self.fgl_targets is None and 'is_dn' not in outputs: + self.fgl_targets = bbox2distance(ref_points, box_cxcywh_to_xyxy(target_boxes), + self.reg_max, outputs['reg_scale'], outputs['up']) + + target_corners, weight_right, weight_left = self.fgl_targets_dn if 'is_dn' in outputs else self.fgl_targets + + ious = torch.diag(box_iou(\ + box_cxcywh_to_xyxy(outputs['pred_boxes'][idx]), box_cxcywh_to_xyxy(target_boxes))[0]) + weight_targets = ious.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + losses['loss_fgl'] = self.unimodal_distribution_focal_loss( + pred_corners, target_corners, weight_right, weight_left, weight_targets, avg_factor=num_boxes) + + if 'teacher_corners' in outputs: + pred_corners = outputs['pred_corners'].reshape(-1, (self.reg_max+1)) + target_corners = outputs['teacher_corners'].reshape(-1, (self.reg_max+1)) + if not torch.equal(pred_corners, target_corners): + weight_targets_local = outputs['teacher_logits'].sigmoid().max(dim=-1)[0] + + mask = torch.zeros_like(weight_targets_local, dtype=torch.bool) + mask[idx] = True + mask = mask.unsqueeze(-1).repeat(1, 1, 4).reshape(-1) + + weight_targets_local[idx] = ious.reshape_as(weight_targets_local[idx]).to(weight_targets_local.dtype) + weight_targets_local = 
weight_targets_local.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach() + + loss_match_local = weight_targets_local * (T ** 2) * (nn.KLDivLoss(reduction='none') + (F.log_softmax(pred_corners / T, dim=1), F.softmax(target_corners.detach() / T, dim=1))).sum(-1) + if 'is_dn' not in outputs: + batch_scale = 8 / outputs['pred_boxes'].shape[0] # Avoid the influence of batch size per GPU + self.num_pos, self.num_neg = (mask.sum() * batch_scale) ** 0.5, ((~mask).sum() * batch_scale) ** 0.5 + loss_match_local1 = loss_match_local[mask].mean() if mask.any() else 0 + loss_match_local2 = loss_match_local[~mask].mean() if (~mask).any() else 0 + losses['loss_ddf'] = (loss_match_local1 * self.num_pos + loss_match_local2 * self.num_neg) / (self.num_pos + self.num_neg) + + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def _get_go_indices(self, indices, indices_aux_list): + """Get a matching union set across all decoder layers. """ + results = [] + for indices_aux in indices_aux_list: + indices = [(torch.cat([idx1[0], idx2[0]]), torch.cat([idx1[1], idx2[1]])) + for idx1, idx2 in zip(indices.copy(), indices_aux.copy())] + + for ind in [torch.cat([idx[0][:, None], idx[1][:, None]], 1) for idx in indices]: + unique, counts = torch.unique(ind, return_counts=True, dim=0) + count_sort_indices = torch.argsort(counts, descending=True) + unique_sorted = unique[count_sort_indices] + column_to_row = {} + for idx in unique_sorted: + row_idx, col_idx = idx[0].item(), idx[1].item() + if row_idx not in column_to_row: + column_to_row[row_idx] = col_idx + final_rows = torch.tensor(list(column_to_row.keys()), device=ind.device) + final_cols = torch.tensor(list(column_to_row.values()), device=ind.device) + results.append((final_rows.long(), final_cols.long())) + return results + + def _clear_cache(self): + self.fgl_targets, self.fgl_targets_dn = None, None + self.own_targets, self.own_targets_dn = None, None + self.num_pos, self.num_neg = None, None + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'boxes': self.loss_boxes, + 'focal': self.loss_labels_focal, + 'vfl': self.loss_labels_vfl, + 'mal': self.loss_labels_mal, + 'local': self.loss_local, + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets, **kwargs): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets)['indices'] + self._clear_cache() + + # Get the matching union set across all decoder layers. 
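+        # (_get_go_indices merges and de-duplicates the per-image matches from
+        # the last layer, every auxiliary decoder layer, the optional pre-head
+        # and the encoder heads; when use_uni_set is True this union set is
+        # reused for the box and local losses below.)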
+ if 'aux_outputs' in outputs: + indices_aux_list, cached_indices, cached_indices_enc = [], [], [] + aux_outputs_list = outputs['aux_outputs'] + if 'pre_outputs' in outputs: + aux_outputs_list = outputs['aux_outputs'] + [outputs['pre_outputs']] + for i, aux_outputs in enumerate(aux_outputs_list): + indices_aux = self.matcher(aux_outputs, targets)['indices'] + cached_indices.append(indices_aux) + indices_aux_list.append(indices_aux) + for i, aux_outputs in enumerate(outputs['enc_aux_outputs']): + indices_enc = self.matcher(aux_outputs, targets)['indices'] + cached_indices_enc.append(indices_enc) + indices_aux_list.append(indices_enc) + indices_go = self._get_go_indices(indices, indices_aux_list) + + num_boxes_go = sum(len(x[0]) for x in indices_go) + num_boxes_go = torch.as_tensor([num_boxes_go], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_available_and_initialized(): + torch.distributed.all_reduce(num_boxes_go) + num_boxes_go = torch.clamp(num_boxes_go / get_world_size(), min=1).item() + else: + assert 'aux_outputs' in outputs, '' + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_available_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses, main loss + losses = {} + for loss in self.losses: + # TODO, indices and num_box are different from RT-DETRv2 + use_uni_set = self.use_uni_set and (loss in ['boxes', 'local']) + indices_in = indices_go if use_uni_set else indices + num_boxes_in = num_boxes_go if use_uni_set else num_boxes + meta = self.get_loss_meta_info(loss, outputs, targets, indices_in) + l_dict = self.get_loss(loss, outputs, targets, indices_in, num_boxes_in, **meta) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + if 'local' in self.losses: # only work for local loss + aux_outputs['up'], aux_outputs['reg_scale'] = outputs['up'], outputs['reg_scale'] + for loss in self.losses: + # TODO, indices and num_box are different from RT-DETRv2 + use_uni_set = self.use_uni_set and (loss in ['boxes', 'local']) + indices_in = indices_go if use_uni_set else cached_indices[i] + num_boxes_in = num_boxes_go if use_uni_set else num_boxes + meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices_in) + l_dict = self.get_loss(loss, aux_outputs, targets, indices_in, num_boxes_in, **meta) + + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer. 
just for dfine + if 'pre_outputs' in outputs: + aux_outputs = outputs['pre_outputs'] + for loss in self.losses: + # TODO, indices and num_box are different from RT-DETRv2 + use_uni_set = self.use_uni_set and (loss in ['boxes', 'local']) + indices_in = indices_go if use_uni_set else cached_indices[-1] + num_boxes_in = num_boxes_go if use_uni_set else num_boxes + meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices_in) + l_dict = self.get_loss(loss, aux_outputs, targets, indices_in, num_boxes_in, **meta) + + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + '_pre': v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of encoder auxiliary losses. + if 'enc_aux_outputs' in outputs: + assert 'enc_meta' in outputs, '' + class_agnostic = outputs['enc_meta']['class_agnostic'] + if class_agnostic: + orig_num_classes = self.num_classes + self.num_classes = 1 + enc_targets = copy.deepcopy(targets) + for t in enc_targets: + t['labels'] = torch.zeros_like(t["labels"]) + else: + enc_targets = targets + + for i, aux_outputs in enumerate(outputs['enc_aux_outputs']): + for loss in self.losses: + # TODO, indices and num_box are different from RT-DETRv2 + use_uni_set = self.use_uni_set and (loss == 'boxes') + indices_in = indices_go if use_uni_set else cached_indices_enc[i] + num_boxes_in = num_boxes_go if use_uni_set else num_boxes + meta = self.get_loss_meta_info(loss, aux_outputs, enc_targets, indices_in) + l_dict = self.get_loss(loss, aux_outputs, enc_targets, indices_in, num_boxes_in, **meta) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_enc_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + if class_agnostic: + self.num_classes = orig_num_classes + + # In case of cdn auxiliary losses. + if 'dn_outputs' in outputs: + assert 'dn_meta' in outputs, '' + indices_dn = self.get_cdn_matched_indices(outputs['dn_meta'], targets) + dn_num_boxes = num_boxes * outputs['dn_meta']['dn_num_group'] + + for i, aux_outputs in enumerate(outputs['dn_outputs']): + if 'local' in self.losses: # only work for local loss + aux_outputs['is_dn'] = True + aux_outputs['up'], aux_outputs['reg_scale'] = outputs['up'], outputs['reg_scale'] + for loss in self.losses: + meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices_dn) + l_dict = self.get_loss(loss, aux_outputs, targets, indices_dn, dn_num_boxes, **meta) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of auxiliary traditional head output at first decoder layer, just for dfine + if 'dn_pre_outputs' in outputs: + aux_outputs = outputs['dn_pre_outputs'] + for loss in self.losses: + meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices_dn) + l_dict = self.get_loss(loss, aux_outputs, targets, indices_dn, dn_num_boxes, **meta) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + '_dn_pre': v for k, v in l_dict.items()} + losses.update(l_dict) + + # For debugging Objects365 pre-train. 
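+        # Sanitisation step: any loss entry that came out as NaN (e.g. a term with no matched
+        # targets on this rank) is replaced by 0 so a single bad value cannot propagate NaN
+        # gradients or break the distributed all-reduce. A typical consumer of this dict
+        # (a sketch, not the exact trainer code) simply does:
+        #   loss = sum(losses.values())
+        #   loss.backward()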
+ losses = {k:torch.nan_to_num(v, nan=0.0) for k, v in losses.items()} + return losses + + def get_loss_meta_info(self, loss, outputs, targets, indices): + if self.boxes_weight_format is None: + return {} + + src_boxes = outputs['pred_boxes'][self._get_src_permutation_idx(indices)] + target_boxes = torch.cat([t['boxes'][j] for t, (_, j) in zip(targets, indices)], dim=0) + + if self.boxes_weight_format == 'iou': + iou, _ = box_iou(box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes)) + iou = torch.diag(iou) + elif self.boxes_weight_format == 'giou': + iou = torch.diag(generalized_box_iou(\ + box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes))) + else: + raise AttributeError() + + if loss in ('boxes', ): + meta = {'boxes_weight': iou} + elif loss in ('vfl', 'mal'): + meta = {'values': iou} + else: + meta = {} + + return meta + + @staticmethod + def get_cdn_matched_indices(dn_meta, targets): + """get_cdn_matched_indices + """ + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \ + torch.zeros(0, dtype=torch.int64, device=device))) + + return dn_match_indices + + + def feature_loss_function(self, fea, target_fea): + loss = (fea - target_fea) ** 2 * ((fea > 0) | (target_fea > 0)).float() + return torch.abs(loss) + + + def unimodal_distribution_focal_loss(self, pred, label, weight_right, weight_left, weight=None, reduction='sum', avg_factor=None): + dis_left = label.long() + dis_right = dis_left + 1 + + loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left.reshape(-1) \ + + F.cross_entropy(pred, dis_right, reduction='none') * weight_right.reshape(-1) + + if weight is not None: + weight = weight.float() + loss = loss * weight + + if avg_factor is not None: + loss = loss.sum() / avg_factor + elif reduction == 'mean': + loss = loss.mean() + elif reduction == 'sum': + loss = loss.sum() + + return loss + + def get_gradual_steps(self, outputs): + num_layers = len(outputs['aux_outputs']) + 1 if 'aux_outputs' in outputs else 1 + step = .5 / (num_layers - 1) + opt_list = [.5 + step * i for i in range(num_layers)] if num_layers > 1 else [1] + return opt_list diff --git a/deim/_engine/engine/deim/denoising.py b/deim/_engine/engine/deim/denoising.py new file mode 100644 index 00000000..98d8218b --- /dev/null +++ b/deim/_engine/engine/deim/denoising.py @@ -0,0 +1,107 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +Modifications Copyright (c) 2024 The DEIM Authors. All Rights Reserved. 
+""" + +import torch + +from .utils import inverse_sigmoid +from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh + + + +def get_contrastive_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0,): + """cnd""" + if num_denoising <= 0: + return None, None, None, None + + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(num_gts) + + input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device) + input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device) + pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device) + + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets[i]['labels'] + input_query_bbox[i, :num_gt] = targets[i]['boxes'] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) + # randomly put a new one here + new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) + input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) + + if box_noise_scale > 0: + known_bbox = box_cxcywh_to_xyxy(input_query_bbox) + diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(input_query_bbox) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) + known_bbox += (rand_sign * rand_part * diff) + known_bbox = torch.clip(known_bbox, min=0.0, max=1.0) + input_query_bbox = box_xyxy_to_cxcywh(known_bbox) + # FIXME, RT-DETR do not have this + input_query_bbox[input_query_bbox < 0] *= -1 + input_query_bbox_unact = inverse_sigmoid(input_query_bbox) + + input_query_logits = class_embed(input_query_class) + + tgt_size = num_denoising + num_queries + attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device) + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True + else: + attn_mask[max_gt_num * 2 
* i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True + + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + # print(input_query_class.shape) # torch.Size([4, 196, 256]) + # print(input_query_bbox.shape) # torch.Size([4, 196, 4]) + # print(attn_mask.shape) # torch.Size([496, 496]) + + return input_query_logits, input_query_bbox_unact, attn_mask, dn_meta diff --git a/deim/_engine/engine/deim/dfine_decoder.py b/deim/_engine/engine/deim/dfine_decoder.py new file mode 100644 index 00000000..c791d8b9 --- /dev/null +++ b/deim/_engine/engine/deim/dfine_decoder.py @@ -0,0 +1,792 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE/) +Copyright (c) 2024 D-FINE Authors. All Rights Reserved. +""" + +import math +import copy +import functools +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from typing import List + +from .dfine_utils import weighting_function, distance2bbox +from .denoising import get_contrastive_denoising_training_group +from .utils import deformable_attention_core_func_v2, get_activation, inverse_sigmoid +from .utils import bias_init_with_prob +from ..core import register + +__all__ = ['DFINETransformer'] + + +class MLP(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.act = get_activation(act) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class MSDeformableAttention(nn.Module): + def __init__( + self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + method='default', + offset_scale=0.5, + ): + """Multi-Scale Deformable Attention + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.offset_scale = offset_scale + + if isinstance(num_points, list): + assert len(num_points) == num_levels, '' + num_points_list = num_points + else: + num_points_list = [num_points for _ in range(num_levels)] + + self.num_points_list = num_points_list + + num_points_scale = [1/n for n in num_points_list for _ in range(n)] + self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32)) + + self.total_points = num_heads * sum(num_points_list) + self.method = method + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + + self.ms_deformable_attn_core = functools.partial(deformable_attention_core_func_v2, method=self.method) + + self._reset_parameters() + + if method == 'discrete': + for p in self.sampling_offsets.parameters(): + p.requires_grad = False + + def _reset_parameters(self): + # sampling_offsets + 
init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] = grid_init.flatten() + + # attention_weights + init.constant_(self.attention_weights.weight, 0) + init.constant_(self.attention_weights.bias, 0) + + + def forward(self, + query: torch.Tensor, + reference_points: torch.Tensor, + value: torch.Tensor, + value_spatial_shapes: List[int]): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + + sampling_offsets: torch.Tensor = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.reshape(bs, Len_q, self.num_heads, sum(self.num_points_list), 2) + + attention_weights = self.attention_weights(query).reshape(bs, Len_q, self.num_heads, sum(self.num_points_list)) + attention_weights = F.softmax(attention_weights, dim=-1) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) + sampling_locations = reference_points.reshape(bs, Len_q, 1, self.num_levels, 1, 2) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + # reference_points [8, 480, None, 1, 4] + # sampling_offsets [8, 480, 8, 12, 2] + num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights, self.num_points_list) + + return output + + +class TransformerDecoderLayer(nn.Module): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation='relu', + n_levels=4, + n_points=4, + cross_attn_method='default', + layer_scale=None): + super(TransformerDecoderLayer, self).__init__() + if layer_scale is not None: + dim_feedforward = round(layer_scale * dim_feedforward) + d_model = round(layer_scale * d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, \ + method=cross_attn_method) + self.dropout2 = nn.Dropout(dropout) + + # gate + self.gateway = Gate(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = get_activation(activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self._reset_parameters() + + def _reset_parameters(self): + init.xavier_uniform_(self.linear1.weight) + init.xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + target, + reference_points, + value, + spatial_shapes, + attn_mask=None, + query_pos_embed=None): + + # self attention + q = k = self.with_pos_embed(target, query_pos_embed) + + target2, _ = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = target + self.dropout1(target2) + target = self.norm1(target) + + # cross attention + target2 = self.cross_attn(\ + self.with_pos_embed(target, query_pos_embed), + reference_points, + value, + spatial_shapes) + + target = self.gateway(target, self.dropout2(target2)) + + # ffn + target2 = self.forward_ffn(target) + target = target + self.dropout4(target2) + target = self.norm3(target.clamp(min=-65504, max=65504)) + + return target + + +class Gate(nn.Module): + def __init__(self, d_model): + super(Gate, self).__init__() + self.gate = nn.Linear(2 * d_model, 2 * d_model) + bias = bias_init_with_prob(0.5) + init.constant_(self.gate.bias, bias) + init.constant_(self.gate.weight, 0) + self.norm = nn.LayerNorm(d_model) + + def forward(self, x1, x2): + gate_input = torch.cat([x1, x2], dim=-1) + gates = torch.sigmoid(self.gate(gate_input)) + gate1, gate2 = gates.chunk(2, dim=-1) + return self.norm(gate1 * x1 + gate2 * x2) + + +class Integral(nn.Module): + """ + A static layer that calculates integral results from a distribution. + + This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`, + where Pr(n) is the softmax probability vector representing the discrete + distribution, and W(n) is the non-uniform Weighting Function. + + Args: + reg_max (int): Max number of the discrete bins. Default is 32. + It can be adjusted based on the dataset or task requirements. 
+ """ + + def __init__(self, reg_max=32): + super(Integral, self).__init__() + self.reg_max = reg_max + + def forward(self, x, project): + shape = x.shape + x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = F.linear(x, project.to(x.device)).reshape(-1, 4) + return x.reshape(list(shape[:-1]) + [-1]) + + +class LQE(nn.Module): + def __init__(self, k, hidden_dim, num_layers, reg_max, act='relu'): + super(LQE, self).__init__() + self.k = k + self.reg_max = reg_max + self.reg_conf = MLP(4 * (k + 1), hidden_dim, 1, num_layers, act=act) + init.constant_(self.reg_conf.layers[-1].bias, 0) + init.constant_(self.reg_conf.layers[-1].weight, 0) + + def forward(self, scores, pred_corners): + B, L, _ = pred_corners.size() + prob = F.softmax(pred_corners.reshape(B, L, 4, self.reg_max+1), dim=-1) + prob_topk, _ = prob.topk(self.k, dim=-1) + stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1) + quality_score = self.reg_conf(stat.reshape(B, L, -1)) + return scores + quality_score + + +class TransformerDecoder(nn.Module): + """ + Transformer Decoder implementing Fine-grained Distribution Refinement (FDR). + + This decoder refines object detection predictions through iterative updates across multiple layers, + utilizing attention mechanisms, location quality estimators, and distribution refinement techniques + to improve bounding box accuracy and robustness. + """ + + def __init__(self, hidden_dim, decoder_layer, decoder_layer_wide, num_layers, num_head, reg_max, reg_scale, up, + eval_idx=-1, layer_scale=2, act='relu'): + super(TransformerDecoder, self).__init__() + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.layer_scale = layer_scale + self.num_head = num_head + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.up, self.reg_scale, self.reg_max = up, reg_scale, reg_max + self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(self.eval_idx + 1)] \ + + [copy.deepcopy(decoder_layer_wide) for _ in range(num_layers - self.eval_idx - 1)]) + self.lqe_layers = nn.ModuleList([copy.deepcopy(LQE(4, 64, 2, reg_max, act=act)) for _ in range(num_layers)]) + + def value_op(self, memory, value_proj, value_scale, memory_mask, memory_spatial_shapes): + """ + Preprocess values for MSDeformableAttention. 
+ """ + value = value_proj(memory) if value_proj is not None else memory + value = F.interpolate(memory, size=value_scale) if value_scale is not None else value + if memory_mask is not None: + value = value * memory_mask.to(value.dtype).unsqueeze(-1) + value = value.reshape(value.shape[0], value.shape[1], self.num_head, -1) + split_shape = [h * w for h, w in memory_spatial_shapes] + return value.permute(0, 2, 3, 1).split(split_shape, dim=-1) + + def convert_to_deploy(self): + self.project = weighting_function(self.reg_max, self.up, self.reg_scale, deploy=True) + self.layers = self.layers[:self.eval_idx + 1] + self.lqe_layers = nn.ModuleList([nn.Identity()] * (self.eval_idx) + [self.lqe_layers[self.eval_idx]]) + + def forward(self, + target, + ref_points_unact, + memory, + spatial_shapes, + bbox_head, + score_head, + query_pos_head, + pre_bbox_head, + integral, + up, + reg_scale, + attn_mask=None, + memory_mask=None, + dn_meta=None): + output = target + output_detach = pred_corners_undetach = 0 + value = self.value_op(memory, None, None, memory_mask, spatial_shapes) + + dec_out_bboxes = [] + dec_out_logits = [] + dec_out_pred_corners = [] + dec_out_refs = [] + if not hasattr(self, 'project'): + project = weighting_function(self.reg_max, up, reg_scale) + else: + project = self.project + + ref_points_detach = F.sigmoid(ref_points_unact) + + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach).clamp(min=-10, max=10) + + # TODO Adjust scale if needed for detachable wider layers + if i >= self.eval_idx + 1 and self.layer_scale > 1: + query_pos_embed = F.interpolate(query_pos_embed, scale_factor=self.layer_scale) + value = self.value_op(memory, None, query_pos_embed.shape[-1], memory_mask, spatial_shapes) + output = F.interpolate(output, size=query_pos_embed.shape[-1]) + output_detach = output.detach() + + output = layer(output, ref_points_input, value, spatial_shapes, attn_mask, query_pos_embed) + + if i == 0 : + # Initial bounding box predictions with inverse sigmoid refinement + pre_bboxes = F.sigmoid(pre_bbox_head(output) + inverse_sigmoid(ref_points_detach)) + pre_scores = score_head[0](output) + ref_points_initial = pre_bboxes.detach() + + # Refine bounding box corners using FDR, integrating previous layer's corrections + pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach + inter_ref_bbox = distance2bbox(ref_points_initial, integral(pred_corners, project), reg_scale) + + if self.training or i == self.eval_idx: + scores = score_head[i](output) + # Lqe does not affect the performance here. 
+ scores = self.lqe_layers[i](scores, pred_corners) + dec_out_logits.append(scores) + dec_out_bboxes.append(inter_ref_bbox) + dec_out_pred_corners.append(pred_corners) + dec_out_refs.append(ref_points_initial) + + if not self.training: + break + + pred_corners_undetach = pred_corners + ref_points_detach = inter_ref_bbox.detach() + output_detach = output.detach() + + return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits), \ + torch.stack(dec_out_pred_corners), torch.stack(dec_out_refs), pre_bboxes, pre_scores + + +@register() +class DFINETransformer(nn.Module): + __share__ = ['num_classes', 'eval_spatial_size'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_points=4, + nhead=8, + num_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learn_query_content=False, + eval_spatial_size=None, + eval_idx=-1, + eps=1e-2, + aux_loss=True, + cross_attn_method='default', + query_select_method='default', + reg_max=32, + reg_scale=4., + layer_scale=1, + mlp_act='relu', + ): + super().__init__() + assert len(feat_channels) <= num_levels + assert len(feat_strides) == len(feat_channels) + + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + scaled_dim = round(layer_scale*hidden_dim) + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_layers = num_layers + self.eval_spatial_size = eval_spatial_size + self.aux_loss = aux_loss + self.reg_max = reg_max + + assert query_select_method in ('default', 'one2many', 'agnostic'), '' + assert cross_attn_method in ('default', 'discrete'), '' + self.cross_attn_method = cross_attn_method + self.query_select_method = query_select_method + + # backbone feature projection + self._build_input_proj_layer(feat_channels) + + # Transformer module + self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False) + self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False) + decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, \ + activation, num_levels, num_points, cross_attn_method=cross_attn_method) + decoder_layer_wide = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, \ + activation, num_levels, num_points, cross_attn_method=cross_attn_method, layer_scale=layer_scale) + self.decoder = TransformerDecoder(hidden_dim, decoder_layer, decoder_layer_wide, num_layers, nhead, + reg_max, self.reg_scale, self.up, eval_idx, layer_scale, act=activation) + # denoising + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + if num_denoising > 0: + self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes) + init.normal_(self.denoising_class_embed.weight[:-1]) + + # decoder embedding + self.learn_query_content = learn_query_content + if learn_query_content: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2, act=mlp_act) + + # if num_select_queries != self.num_queries: + # layer = TransformerEncoderLayer(hidden_dim, nhead, dim_feedforward, activation='gelu') + # self.encoder = TransformerEncoder(layer, 1) + + self.enc_output = 
nn.Sequential(OrderedDict([ + ('proj', nn.Linear(hidden_dim, hidden_dim)), + ('norm', nn.LayerNorm(hidden_dim,)), + ])) + + if query_select_method == 'agnostic': + self.enc_score_head = nn.Linear(hidden_dim, 1) + else: + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, act=mlp_act) + + # decoder head + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.dec_score_head = nn.ModuleList( + [nn.Linear(hidden_dim, num_classes) for _ in range(self.eval_idx + 1)] + + [nn.Linear(scaled_dim, num_classes) for _ in range(num_layers - self.eval_idx - 1)]) + self.pre_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, act=mlp_act) + self.dec_bbox_head = nn.ModuleList( + [MLP(hidden_dim, hidden_dim, 4 * (self.reg_max+1), 3, act=mlp_act) for _ in range(self.eval_idx + 1)] + + [MLP(scaled_dim, scaled_dim, 4 * (self.reg_max+1), 3, act=mlp_act) for _ in range(num_layers - self.eval_idx - 1)]) + self.integral = Integral(self.reg_max) + + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + anchors, valid_mask = self._generate_anchors() + self.register_buffer('anchors', anchors) + self.register_buffer('valid_mask', valid_mask) + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + self.anchors, self.valid_mask = self._generate_anchors() + + + self._reset_parameters(feat_channels) + + def convert_to_deploy(self): + self.dec_score_head = nn.ModuleList([nn.Identity()] * (self.eval_idx) + [self.dec_score_head[self.eval_idx]]) + self.dec_bbox_head = nn.ModuleList( + [self.dec_bbox_head[i] if i <= self.eval_idx else nn.Identity() for i in range(len(self.dec_bbox_head))] + ) + + def _reset_parameters(self, feat_channels): + bias = bias_init_with_prob(0.01) + init.constant_(self.enc_score_head.bias, bias) + init.constant_(self.enc_bbox_head.layers[-1].weight, 0) + init.constant_(self.enc_bbox_head.layers[-1].bias, 0) + + init.constant_(self.pre_bbox_head.layers[-1].weight, 0) + init.constant_(self.pre_bbox_head.layers[-1].bias, 0) + + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + init.constant_(cls_.bias, bias) + if hasattr(reg_, 'layers'): + init.constant_(reg_.layers[-1].weight, 0) + init.constant_(reg_.layers[-1].bias, 0) + + init.xavier_uniform_(self.enc_output[0].weight) + if self.learn_query_content: + init.xavier_uniform_(self.tgt_embed.weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + for m, in_channels in zip(self.input_proj, feat_channels): + if in_channels != self.hidden_dim: + init.xavier_uniform_(m[0].weight) + + def _build_input_proj_layer(self, feat_channels): + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim,))]) + ) + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + if in_channels == self.hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim))]) + ) + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats: List[torch.Tensor]): + # get projection features + 
proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + return feat_flatten, spatial_shapes + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype=torch.float32, + device='cpu'): + if spatial_shapes is None: + spatial_shapes = [] + eval_h, eval_w = self.eval_spatial_size + for s in self.feat_strides: + spatial_shapes.append([int(eval_h / s), int(eval_w / s)]) + + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], dim=-1) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4) + anchors.append(lvl_anchors) + + anchors = torch.concat(anchors, dim=1).to(device) + valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.inf) + + return anchors, valid_mask + + + def _get_decoder_input(self, + memory: torch.Tensor, + spatial_shapes, + denoising_logits=None, + denoising_bbox_unact=None): + + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors = self.anchors + valid_mask = self.valid_mask + if memory.shape[0] > 1: + anchors = anchors.repeat(memory.shape[0], 1, 1) + + # memory = torch.where(valid_mask, memory, 0) + # TODO fix type error for onnx export + memory = valid_mask.to(memory.dtype) * memory + + output_memory :torch.Tensor = self.enc_output(memory) + enc_outputs_logits :torch.Tensor = self.enc_score_head(output_memory) + + enc_topk_bboxes_list, enc_topk_logits_list = [], [] + enc_topk_memory, enc_topk_logits, enc_topk_anchors = \ + self._select_topk(output_memory, enc_outputs_logits, anchors, self.num_queries) + + enc_topk_bbox_unact :torch.Tensor = self.enc_bbox_head(enc_topk_memory) + enc_topk_anchors + + if self.training: + enc_topk_bboxes = F.sigmoid(enc_topk_bbox_unact) + enc_topk_bboxes_list.append(enc_topk_bboxes) + enc_topk_logits_list.append(enc_topk_logits) + + # if self.num_select_queries != self.num_queries: + # raise NotImplementedError('') + + if self.learn_query_content: + content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1]) + else: + content = enc_topk_memory.detach() + + enc_topk_bbox_unact = enc_topk_bbox_unact.detach() + + if denoising_bbox_unact is not None: + enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) + content = torch.concat([denoising_logits, content], dim=1) + + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + + def _select_topk(self, memory: torch.Tensor, outputs_logits: torch.Tensor, outputs_anchors_unact: 
torch.Tensor, topk: int): + if self.query_select_method == 'default': + _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1) + + elif self.query_select_method == 'one2many': + _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1) + topk_ind = topk_ind // self.num_classes + + elif self.query_select_method == 'agnostic': + _, topk_ind = torch.topk(outputs_logits.squeeze(-1), topk, dim=-1) + + topk_ind: torch.Tensor + + topk_anchors = outputs_anchors_unact.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_anchors_unact.shape[-1])) + + topk_logits = outputs_logits.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1])) if self.training else None + + topk_memory = memory.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])) + + return topk_memory, topk_logits, topk_anchors + + def forward(self, feats, targets=None): + # input projection and embedding + memory, spatial_shapes = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=1.0, + ) + else: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = \ + self._get_decoder_input(memory, spatial_shapes, denoising_logits, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits, out_corners, out_refs, pre_bboxes, pre_logits = self.decoder( + init_ref_contents, + init_ref_points_unact, + memory, + spatial_shapes, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + self.pre_bbox_head, + self.integral, + self.up, + self.reg_scale, + attn_mask=attn_mask, + dn_meta=dn_meta) + + if self.training and dn_meta is not None: + # the output from the first decoder layer, only one + dn_pre_logits, pre_logits = torch.split(pre_logits, dn_meta['dn_num_split'], dim=1) + dn_pre_bboxes, pre_bboxes = torch.split(pre_bboxes, dn_meta['dn_num_split'], dim=1) + + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + + dn_out_corners, out_corners = torch.split(out_corners, dn_meta['dn_num_split'], dim=2) + dn_out_refs, out_refs = torch.split(out_refs, dn_meta['dn_num_split'], dim=2) + + + if self.training: + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1], 'pred_corners': out_corners[-1], + 'ref_points': out_refs[-1], 'up': self.up, 'reg_scale': self.reg_scale} + else: + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + if self.training and self.aux_loss: + out['aux_outputs'] = self._set_aux_loss2(out_logits[:-1], out_bboxes[:-1], out_corners[:-1], out_refs[:-1], + out_corners[-1], out_logits[-1]) + out['enc_aux_outputs'] = self._set_aux_loss(enc_topk_logits_list, enc_topk_bboxes_list) + out['pre_outputs'] = {'pred_logits': pre_logits, 'pred_boxes': pre_bboxes} + out['enc_meta'] = {'class_agnostic': self.query_select_method == 'agnostic'} + + if dn_meta is not None: + out['dn_outputs'] = self._set_aux_loss2(dn_out_logits, dn_out_bboxes, dn_out_corners, dn_out_refs, + dn_out_corners[-1], dn_out_logits[-1]) + out['dn_pre_outputs'] = 
{'pred_logits': dn_pre_logits, 'pred_boxes': dn_pre_bboxes} + out['dn_meta'] = dn_meta + + return out + + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} for a, b in zip(outputs_class, outputs_coord)] + + + @torch.jit.unused + def _set_aux_loss2(self, outputs_class, outputs_coord, outputs_corners, outputs_ref, + teacher_corners=None, teacher_logits=None): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b, 'pred_corners': c, 'ref_points': d, + 'teacher_corners': teacher_corners, 'teacher_logits': teacher_logits} + for a, b, c, d in zip(outputs_class, outputs_coord, outputs_corners, outputs_ref)] diff --git a/deim/_engine/engine/deim/dfine_utils.py b/deim/_engine/engine/deim/dfine_utils.py new file mode 100644 index 00000000..c0864e20 --- /dev/null +++ b/deim/_engine/engine/deim/dfine_utils.py @@ -0,0 +1,156 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +""" + +import torch +from .box_ops import box_xyxy_to_cxcywh + + +def weighting_function(reg_max, up, reg_scale, deploy=False): + """ + Generates the non-uniform Weighting Function W(n) for bounding box regression. + + Args: + reg_max (int): Max number of the discrete bins. + up (Tensor): Controls upper bounds of the sequence, + where maximum offset is ยฑup * H / W. + reg_scale (float): Controls the curvature of the Weighting Function. + Larger values result in flatter weights near the central axis W(reg_max/2)=0 + and steeper weights at both ends. + deploy (bool): If True, uses deployment mode settings. + + Returns: + Tensor: Sequence of Weighting Function. + """ + if deploy: + upper_bound1 = (abs(up[0]) * abs(reg_scale)).item() + upper_bound2 = (abs(up[0]) * abs(reg_scale) * 2).item() + step = (upper_bound1 + 1) ** (2 / (reg_max - 2)) + left_values = [-(step) ** i + 1 for i in range(reg_max // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)] + values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2] + return torch.tensor(values, dtype=up.dtype, device=up.device) + else: + upper_bound1 = abs(up[0]) * abs(reg_scale) + upper_bound2 = abs(up[0]) * abs(reg_scale) * 2 + step = (upper_bound1 + 1) ** (2 / (reg_max - 2)) + left_values = [-(step) ** i + 1 for i in range(reg_max // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)] + values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2] + return torch.cat(values, 0) + + +def translate_gt(gt, reg_max, reg_scale, up): + """ + Decodes bounding box ground truth (GT) values into distribution-based GT representations. + + This function maps continuous GT values into discrete distribution bins, which can be used + for regression tasks in object detection models. It calculates the indices of the closest + bins to each GT value and assigns interpolation weights to these bins based on their proximity + to the GT value. + + Args: + gt (Tensor): Ground truth bounding box values, shape (N, ). + reg_max (int): Maximum number of discrete bins for the distribution. 
+ reg_scale (float): Controls the curvature of the Weighting Function. + up (Tensor): Controls the upper bounds of the Weighting Function. + + Returns: + Tuple[Tensor, Tensor, Tensor]: + - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ). + - weight_right (Tensor): Weight assigned to the right bin, shape (N, ). + - weight_left (Tensor): Weight assigned to the left bin, shape (N, ). + """ + gt = gt.reshape(-1) + function_values = weighting_function(reg_max, up, reg_scale) + + # Find the closest left-side indices for each value + diffs = function_values.unsqueeze(0) - gt.unsqueeze(1) + mask = diffs <= 0 + closest_left_indices = torch.sum(mask, dim=1) - 1 + + # Calculate the weights for the interpolation + indices = closest_left_indices.float() + + weight_right = torch.zeros_like(indices) + weight_left = torch.zeros_like(indices) + + valid_idx_mask = (indices >= 0) & (indices < reg_max) + valid_indices = indices[valid_idx_mask].long() + + # Obtain distances + left_values = function_values[valid_indices] + right_values = function_values[valid_indices + 1] + + left_diffs = torch.abs(gt[valid_idx_mask] - left_values) + right_diffs = torch.abs(right_values - gt[valid_idx_mask]) + + # Valid weights + weight_right[valid_idx_mask] = left_diffs / (left_diffs + right_diffs) + weight_left[valid_idx_mask] = 1.0 - weight_right[valid_idx_mask] + + # Invalid weights (out of range) + invalid_idx_mask_neg = (indices < 0) + weight_right[invalid_idx_mask_neg] = 0.0 + weight_left[invalid_idx_mask_neg] = 1.0 + indices[invalid_idx_mask_neg] = 0.0 + + invalid_idx_mask_pos = (indices >= reg_max) + weight_right[invalid_idx_mask_pos] = 1.0 + weight_left[invalid_idx_mask_pos] = 0.0 + indices[invalid_idx_mask_pos] = reg_max - 0.1 + + return indices, weight_right, weight_left + + +def distance2bbox(points, distance, reg_scale): + """ + Decodes edge-distances into bounding box coordinates. + + Args: + points (Tensor): (B, N, 4) or (N, 4) format, representing [x, y, w, h], + where (x, y) is the center and (w, h) are width and height. + distance (Tensor): (B, N, 4) or (N, 4), representing distances from the + point to the left, top, right, and bottom boundaries. + + reg_scale (float): Controls the curvature of the Weighting Function. + + Returns: + Tensor: Bounding boxes in (N, 4) or (B, N, 4) format [cx, cy, w, h]. + """ + reg_scale = abs(reg_scale) + x1 = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale) + y1 = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale) + x2 = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale) + y2 = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale) + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + return box_xyxy_to_cxcywh(bboxes) + + +def bbox2distance(points, bbox, reg_max, reg_scale, up, eps=0.1): + """ + Converts bounding box coordinates to distances from a reference point. + + Args: + points (Tensor): (n, 4) [x, y, w, h], where (x, y) is the center. + bbox (Tensor): (n, 4) bounding boxes in "xyxy" format. + reg_max (float): Maximum bin value. + reg_scale (float): Controling curvarture of W(n). + up (Tensor): Controling upper bounds of W(n). + eps (float): Small value to ensure target < reg_max. + + Returns: + Tensor: Decoded distances. 
+ """ + reg_scale = abs(reg_scale) + left = (points[:, 0] - bbox[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + top = (points[:, 1] - bbox[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + right = (bbox[:, 2] - points[:, 0]) / (points[..., 2] / reg_scale + 1e-16) - 0.5 * reg_scale + bottom = (bbox[:, 3] - points[:, 1]) / (points[..., 3] / reg_scale + 1e-16) - 0.5 * reg_scale + four_lens = torch.stack([left, top, right, bottom], -1) + four_lens, weight_right, weight_left = translate_gt(four_lens, reg_max, reg_scale, up) + if reg_max is not None: + four_lens = four_lens.clamp(min=0, max=reg_max-eps) + return four_lens.reshape(-1).detach(), weight_right.detach(), weight_left.detach() diff --git a/deim/_engine/engine/deim/hybrid_encoder.py b/deim/_engine/engine/deim/hybrid_encoder.py new file mode 100644 index 00000000..14b49159 --- /dev/null +++ b/deim/_engine/engine/deim/hybrid_encoder.py @@ -0,0 +1,433 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE/) +Copyright (c) 2024 D-FINE Authors. All Rights Reserved. +""" + +import copy +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import get_activation + +from ..core import register + +__all__ = ['HybridEncoder'] + + +class ConvNormLayer_fuse(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, g=1, padding=None, bias=False, act=None): + super().__init__() + padding = (kernel_size-1)//2 if padding is None else padding + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + groups=g, + padding=padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = nn.Identity() if act is None else get_activation(act) + self.ch_in, self.ch_out, self.kernel_size, self.stride, self.g, self.padding, self.bias = \ + ch_in, ch_out, kernel_size, stride, g, padding, bias + + def forward(self, x): + if hasattr(self, 'conv_bn_fused'): + y = self.conv_bn_fused(x) + else: + y = self.norm(self.conv(x)) + return self.act(y) + + def convert_to_deploy(self): + if not hasattr(self, 'conv_bn_fused'): + self.conv_bn_fused = nn.Conv2d( + self.ch_in, + self.ch_out, + self.kernel_size, + self.stride, + groups=self.g, + padding=self.padding, + bias=True) + + kernel, bias = self.get_equivalent_kernel_bias() + self.conv_bn_fused.weight.data = kernel + self.conv_bn_fused.bias.data = bias + self.__delattr__('conv') + self.__delattr__('norm') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor() + + return kernel3x3, bias3x3 + + def _fuse_bn_tensor(self): + kernel = self.conv.weight + running_mean = self.norm.running_mean + running_var = self.norm.running_var + gamma = self.norm.weight + beta = self.norm.bias + eps = self.norm.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class ConvNormLayer(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, g=1, padding=None, bias=False, act=None): + super().__init__() + padding = (kernel_size-1)//2 if padding is None else padding + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + groups=g, + padding=padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = nn.Identity() if act is None else get_activation(act) + 
+ def forward(self, x): + return self.act(self.norm(self.conv(x))) + + +# TODO, add activation for cv1 following YOLOv10 +# self.cv1 = Conv(c1, c2, 1, 1) +# self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False) +class SCDown(nn.Module): + def __init__(self, c1, c2, k, s, act=None): + super().__init__() + self.cv1 = ConvNormLayer_fuse(c1, c2, 1, 1) + self.cv2 = ConvNormLayer_fuse(c2, c2, k, s, c2) + + def forward(self, x): + return self.cv2(self.cv1(x)) + + +class VGGBlock(nn.Module): + def __init__(self, ch_in, ch_out, act='relu'): + super().__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None) + self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + y = self.conv1(x) + self.conv2(x) + + return self.act(y) + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1) + + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.data = kernel + self.conv.bias.data = bias + self.__delattr__('conv1') + self.__delattr__('conv2') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return F.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: ConvNormLayer): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.norm.running_mean + running_var = branch.norm.running_var + gamma = branch.norm.weight + beta = branch.norm.bias + eps = branch.norm.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class CSPLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + num_blocks=3, + expansion=1.0, + bias=False, + act="silu", + bottletype=VGGBlock): + super(CSPLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = ConvNormLayer_fuse(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.conv2 = ConvNormLayer_fuse(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(*[ + bottletype(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks) + ]) + if hidden_channels != out_channels: + self.conv3 = ConvNormLayer_fuse(hidden_channels, out_channels, 1, 1, bias=bias, act=act) + else: + self.conv3 = nn.Identity() + + def forward(self, x): + x_2 = self.conv2(x) + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + return self.conv3(x_1 + x_2) + +class RepNCSPELAN4(nn.Module): + # csp-elan + def __init__(self, c1, c2, c3, c4, n=3, + bias=False, + act="silu"): + super().__init__() + self.c = c3//2 + self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act) + self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act)) + self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act) + + def forward_chunk(self, x): + y = list(self.cv1(x).chunk(2, 1)) + 
y.extend((m(y[-1])) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + def forward(self, x): + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + +# transformer +class TransformerEncoderLayer(nn.Module): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False): + super().__init__() + self.normalize_before = normalize_before + + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True) + + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = get_activation(activation) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +@register() +class HybridEncoder(nn.Module): + __share__ = ['eval_spatial_size', ] + + def __init__(self, + in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + hidden_dim=256, + nhead=8, + dim_feedforward = 1024, + dropout=0.0, + enc_act='gelu', + use_encoder_idx=[2], + num_encoder_layers=1, + pe_temperature=10000, + expansion=1.0, + depth_mult=1.0, + act='silu', + eval_spatial_size=None, + version='dfine', + ): + super().__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + self.out_channels = [hidden_dim for _ in range(len(in_channels))] + self.out_strides = feat_strides + + # channel projection + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + proj = nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), + ('norm', nn.BatchNorm2d(hidden_dim)) + ])) + + self.input_proj.append(proj) + + # encoder transformer + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=enc_act + ) + + self.encoder = nn.ModuleList([ + 
TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx)) + ]) + + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + # TODO, add activation for those lateral convs + if version == 'dfine': + self.lateral_convs.append(ConvNormLayer_fuse(hidden_dim, hidden_dim, 1, 1)) + else: + self.lateral_convs.append(ConvNormLayer_fuse(hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_blocks.append( + RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult), act=act) \ + if version == 'dfine' else CSPLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion, bottletype=VGGBlock) + ) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.downsample_convs.append( + nn.Sequential(SCDown(hidden_dim, hidden_dim, 3, 2, act=act)) \ + if version == 'dfine' else ConvNormLayer_fuse(hidden_dim, hidden_dim, 3, 2, act=act) + ) + self.pan_blocks.append( + RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult), act=act) \ + if version == 'dfine' else CSPLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion, bottletype=VGGBlock) + ) + + self._reset_parameters() + + def _reset_parameters(self): + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride, + self.hidden_dim, self.pe_temperature) + setattr(self, f'pos_embed{idx}', pos_embed) + # self.register_buffer(f'pos_embed{idx}', pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): + """ + """ + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') + assert embed_dim % 4 == 0, \ + 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. 
/ (temperature ** omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats): + assert len(feats) == len(self.in_channels) + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) + else: + pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device) + + memory :torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest') + inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) + outs.append(out) + + return outs diff --git a/engine/deim/matcher.py b/deim/_engine/engine/deim/matcher.py similarity index 100% rename from engine/deim/matcher.py rename to deim/_engine/engine/deim/matcher.py diff --git a/engine/deim/postprocessor.py b/deim/_engine/engine/deim/postprocessor.py similarity index 100% rename from engine/deim/postprocessor.py rename to deim/_engine/engine/deim/postprocessor.py diff --git a/engine/deim/rtdetrv2_decoder.py b/deim/_engine/engine/deim/rtdetrv2_decoder.py similarity index 100% rename from engine/deim/rtdetrv2_decoder.py rename to deim/_engine/engine/deim/rtdetrv2_decoder.py diff --git a/deim/_engine/engine/deim/utils.py b/deim/_engine/engine/deim/utils.py new file mode 100644 index 00000000..2c20c6f7 --- /dev/null +++ b/deim/_engine/engine/deim/utils.py @@ -0,0 +1,182 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2023 . All Rights Reserved. +""" + +import math +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def inverse_sigmoid(x: torch.Tensor, eps: float=1e-5) -> torch.Tensor: + x = x.clip(min=0., max=1.) 
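+    # i.e. the logit function log(x / (1 - x)), clipped by eps so inputs of exactly 0 or 1
+    # stay finite; typically applied to normalized reference points/boxes so offsets can be
+    # added in unbounded logit space before a final sigmoid.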
+ return torch.log(x.clip(min=eps) / (1 - x).clip(min=eps)) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-math.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def deformable_attention_core_func(value, value_spatial_shapes, sampling_locations, attention_weights): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor|List): [n_levels, 2] + value_level_start_index (Tensor|List): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, _, n_head, c = value.shape + _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape + + split_shape = [h * w for h, w in value_spatial_shapes] + value_list = value.split(split_shape, dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[level].flatten(2).permute( + 0, 2, 1).reshape(bs * n_head, c, h, w) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].permute( + 0, 2, 1, 3, 4).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.permute(0, 2, 1, 3, 4).reshape( + bs * n_head, 1, Len_q, n_levels * n_points) + output = (torch.stack( + sampling_value_list, dim=-2).flatten(-2) * + attention_weights).sum(-1).reshape(bs, n_head * c, Len_q) + + return output.permute(0, 2, 1) + + + +def deformable_attention_core_func_v2(\ + value: torch.Tensor, + value_spatial_shapes, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, + num_points_list: List[int], + method='default', + value_shape='default', + ): + """ + Args: + value (Tensor): [bs, value_length, n_head, c] + value_spatial_shapes (Tensor|List): [n_levels, 2] + value_level_start_index (Tensor|List): [n_levels] + sampling_locations (Tensor): [bs, query_length, n_head, n_levels * n_points, 2] + attention_weights (Tensor): [bs, query_length, n_head, n_levels * n_points] + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + # TODO find the version + if value_shape == 'default': + bs, n_head, c, _ = value[0].shape + elif value_shape == 'reshape': # reshape following RT-DETR + bs, _, n_head, c = value.shape + split_shape = [h * w for h, w in value_spatial_shapes] + value = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1) + _, Len_q, _, _, _ = sampling_locations.shape + + # sampling_offsets [8, 480, 8, 12, 2] + if method == 'default': + sampling_grids = 2 * sampling_locations - 1 + + elif method == 'discrete': + sampling_grids = sampling_locations + + sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1) + sampling_locations_list = sampling_grids.split(num_points_list, dim=-2) + + sampling_value_list = [] + for level, (h, w) in enumerate(value_spatial_shapes): + value_l = value[level].reshape(bs * n_head, c, h, w) + sampling_grid_l: torch.Tensor = sampling_locations_list[level] + + if method == 
'default': + sampling_value_l = F.grid_sample( + value_l, + sampling_grid_l, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + + elif method == 'discrete': + # n * m, seq, n, 2 + sampling_coord = (sampling_grid_l * torch.tensor([[w, h]], device=value_l.device) + 0.5).to(torch.int64) + + # FIX ME? for rectangle input + sampling_coord = sampling_coord.clamp(0, h - 1) + sampling_coord = sampling_coord.reshape(bs * n_head, Len_q * num_points_list[level], 2) + + s_idx = torch.arange(sampling_coord.shape[0], device=value_l.device).unsqueeze(-1).repeat(1, sampling_coord.shape[1]) + sampling_value_l: torch.Tensor = value_l[s_idx, :, sampling_coord[..., 1], sampling_coord[..., 0]] # n l c + + sampling_value_l = sampling_value_l.permute(0, 2, 1).reshape(bs * n_head, c, Len_q, num_points_list[level]) + + sampling_value_list.append(sampling_value_l) + + attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, Len_q, sum(num_points_list)) + weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights + output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q) + + return output.permute(0, 2, 1) + + +def get_activation(act: str, inpace: bool=True): + """get activation + """ + if act is None: + return nn.Identity() + + elif isinstance(act, nn.Module): + return act + + act = act.lower() + + if act == 'silu' or act == 'swish': + m = nn.SiLU() + + elif act == 'relu': + m = nn.ReLU() + + elif act == 'leaky_relu': + m = nn.LeakyReLU() + + elif act == 'silu': + m = nn.SiLU() + + elif act == 'gelu': + m = nn.GELU() + + elif act == 'hardsigmoid': + m = nn.Hardsigmoid() + + else: + raise RuntimeError('') + + if hasattr(m, 'inplace'): + m.inplace = inpace + + return m diff --git a/engine/misc/__init__.py b/deim/_engine/engine/misc/__init__.py similarity index 100% rename from engine/misc/__init__.py rename to deim/_engine/engine/misc/__init__.py diff --git a/engine/misc/box_ops.py b/deim/_engine/engine/misc/box_ops.py similarity index 100% rename from engine/misc/box_ops.py rename to deim/_engine/engine/misc/box_ops.py diff --git a/engine/misc/dist_utils.py b/deim/_engine/engine/misc/dist_utils.py similarity index 100% rename from engine/misc/dist_utils.py rename to deim/_engine/engine/misc/dist_utils.py diff --git a/engine/misc/lazy_loader.py b/deim/_engine/engine/misc/lazy_loader.py similarity index 100% rename from engine/misc/lazy_loader.py rename to deim/_engine/engine/misc/lazy_loader.py diff --git a/engine/misc/logger.py b/deim/_engine/engine/misc/logger.py similarity index 100% rename from engine/misc/logger.py rename to deim/_engine/engine/misc/logger.py diff --git a/engine/misc/profiler_utils.py b/deim/_engine/engine/misc/profiler_utils.py similarity index 100% rename from engine/misc/profiler_utils.py rename to deim/_engine/engine/misc/profiler_utils.py diff --git a/engine/misc/visualizer.py b/deim/_engine/engine/misc/visualizer.py similarity index 100% rename from engine/misc/visualizer.py rename to deim/_engine/engine/misc/visualizer.py diff --git a/engine/optim/__init__.py b/deim/_engine/engine/optim/__init__.py similarity index 100% rename from engine/optim/__init__.py rename to deim/_engine/engine/optim/__init__.py diff --git a/engine/optim/amp.py b/deim/_engine/engine/optim/amp.py similarity index 100% rename from engine/optim/amp.py rename to deim/_engine/engine/optim/amp.py diff --git a/engine/optim/ema.py b/deim/_engine/engine/optim/ema.py similarity index 100% rename from engine/optim/ema.py rename to 
deim/_engine/engine/optim/ema.py diff --git a/engine/optim/lr_scheduler.py b/deim/_engine/engine/optim/lr_scheduler.py similarity index 100% rename from engine/optim/lr_scheduler.py rename to deim/_engine/engine/optim/lr_scheduler.py diff --git a/engine/optim/optim.py b/deim/_engine/engine/optim/optim.py similarity index 100% rename from engine/optim/optim.py rename to deim/_engine/engine/optim/optim.py diff --git a/engine/optim/warmup.py b/deim/_engine/engine/optim/warmup.py similarity index 100% rename from engine/optim/warmup.py rename to deim/_engine/engine/optim/warmup.py diff --git a/engine/solver/__init__.py b/deim/_engine/engine/solver/__init__.py similarity index 100% rename from engine/solver/__init__.py rename to deim/_engine/engine/solver/__init__.py diff --git a/engine/solver/_solver.py b/deim/_engine/engine/solver/_solver.py similarity index 100% rename from engine/solver/_solver.py rename to deim/_engine/engine/solver/_solver.py diff --git a/engine/solver/clas_engine.py b/deim/_engine/engine/solver/clas_engine.py similarity index 100% rename from engine/solver/clas_engine.py rename to deim/_engine/engine/solver/clas_engine.py diff --git a/engine/solver/clas_solver.py b/deim/_engine/engine/solver/clas_solver.py similarity index 100% rename from engine/solver/clas_solver.py rename to deim/_engine/engine/solver/clas_solver.py diff --git a/engine/solver/det_engine.py b/deim/_engine/engine/solver/det_engine.py similarity index 100% rename from engine/solver/det_engine.py rename to deim/_engine/engine/solver/det_engine.py diff --git a/engine/solver/det_solver.py b/deim/_engine/engine/solver/det_solver.py similarity index 100% rename from engine/solver/det_solver.py rename to deim/_engine/engine/solver/det_solver.py diff --git a/deim/_engine/misc/__init__.py b/deim/_engine/misc/__init__.py new file mode 100644 index 00000000..acd6b469 --- /dev/null +++ b/deim/_engine/misc/__init__.py @@ -0,0 +1,9 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +from .logger import * +from .visualizer import * +from .dist_utils import setup_seed, setup_print +from .profiler_utils import stats diff --git a/deim/_engine/misc/box_ops.py b/deim/_engine/misc/box_ops.py new file mode 100644 index 00000000..bdaa0cf8 --- /dev/null +++ b/deim/_engine/misc/box_ops.py @@ -0,0 +1,105 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torchvision +from torch import Tensor +from typing import List, Tuple + + +def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + return torchvision.ops.generalized_box_iou(boxes1, boxes2) + + +# elementwise +def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: + """ + Args: + boxes1, [N, 4] + boxes2, [N, 4] + Returns: + iou, [N, ] + union, [N, ] + """ + area1 = torchvision.ops.box_area(boxes1) # [N, ] + area2 = torchvision.ops.box_area(boxes2) # [N, ] + lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N, 2] + rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] + wh = (rb - lt).clamp(min=0) # [N, 2] + inter = wh[:, 0] * wh[:, 1] # [N, ] + union = area1 + area2 - inter + iou = inter / union + return iou, union + + +def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: + """ + Args: + boxes1, [N, 4] with [x1, y1, x2, y2] + boxes2, [N, 4] with [x1, y1, x2, y2] + Returns: + giou, [N, ] + """ + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = elementwise_box_iou(boxes1, boxes2) + lt = torch.min(boxes1[:, :2], boxes2[:, :2]) # [N, 2] + rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2] + wh = (rb - lt).clamp(min=0) # [N, 2] + area = wh[:, 0] * wh[:, 1] + return iou - (area - union) / area + + +def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor: + """ + Args: + points, [K, 2], (x, y) + boxes, [N, 4], (x1, y1, y2, y2) + Returns: + Tensor (bool), [K, N] + """ + x, y = [p.unsqueeze(-1) for p in points.unbind(-1)] + x1, y1, x2, y2 = [x.unsqueeze(0) for x in boxes.unbind(-1)] + + l = x - x1 + t = y - y1 + r = x2 - x + b = y2 - y + + ltrb = torch.stack([l, t, r, b], dim=-1) + mask = ltrb.min(dim=-1).values > eps + + return mask + + +def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor: + """ + Args: + boxes, [N, 4], (x1, y1, x2, y2) + points, [N, 2], (x, y) + Returns: + Tensor (N, 4), (l, t, r, b) + """ + x1y1, x2y2 = torch.split(boxes, 2, dim=-1) + lt = points - x1y1 + rb = x2y2 - points + return torch.concat([lt, rb], dim=-1) + + +def point_distance_box(points: Tensor, distances: Tensor) -> Tensor: + """ + Args: + points (Tensor), [N, 2], (x, y) + distances (Tensor), [N, 4], (l, t, r, b) + Returns: + boxes (Tensor), (N, 4), (x1, y1, x2, y2) + """ + lt, rb = torch.split(distances, 2, dim=-1) + x1y1 = -lt + points + x2y2 = rb + points + boxes = torch.concat([x1y1, x2y2], dim=-1) + return boxes diff --git a/deim/_engine/misc/dist_utils.py b/deim/_engine/misc/dist_utils.py new file mode 100644 index 00000000..368d4353 --- /dev/null +++ b/deim/_engine/misc/dist_utils.py @@ -0,0 +1,268 @@ +""" +reference +- https://github.com/pytorch/vision/blob/main/references/detection/utils.py +- https://github.com/facebookresearch/detr/blob/master/util/misc.py#L406 + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import os +import time +import random +import numpy as np +import atexit + +import torch +import torch.nn as nn +import torch.distributed +import torch.backends.cudnn + +from torch.nn.parallel import DataParallel as DP +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + +from torch.utils.data import DistributedSampler +# from torch.utils.data.dataloader import DataLoader +from ..data import DataLoader + + +def setup_distributed(print_rank: int=0, print_method: str='builtin', seed: int=None, ): + """ + env setup + args: + print_rank, + print_method, (builtin, rich) + seed, + """ + try: + # https://pytorch.org/docs/stable/elastic/run.html + RANK = int(os.getenv('RANK', -1)) + LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) + WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) + + # torch.distributed.init_process_group(backend=backend, init_method='env://') + torch.distributed.init_process_group(init_method='env://') + torch.distributed.barrier() + + rank = torch.distributed.get_rank() + torch.cuda.set_device(rank) + torch.cuda.empty_cache() + enabled_dist = True + if get_rank() == print_rank: + print('Initialized distributed mode...') + + except Exception: + enabled_dist = False + print('Not init distributed mode.') + + setup_print(get_rank() == print_rank, method=print_method) + if seed is not None: + setup_seed(seed) + + return enabled_dist + + +def setup_print(is_main, method='builtin'): + """This function disables printing when not in master process + """ + import builtins as __builtin__ + + if method == 'builtin': + builtin_print = __builtin__.print + + elif method == 'rich': + import rich + builtin_print = rich.print + + else: + raise AttributeError('') + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_main or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_available_and_initialized(): + if not torch.distributed.is_available(): + return False + if not torch.distributed.is_initialized(): + return False + return True + + +@atexit.register +def cleanup(): + """cleanup distributed environment + """ + if is_dist_available_and_initialized(): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + +def get_rank(): + if not is_dist_available_and_initialized(): + return 0 + return torch.distributed.get_rank() + + +def get_world_size(): + if not is_dist_available_and_initialized(): + return 1 + return torch.distributed.get_world_size() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + + +def warp_model( + model: torch.nn.Module, + sync_bn: bool=False, + dist_mode: str='ddp', + find_unused_parameters: bool=False, + compile: bool=False, + compile_mode: str='reduce-overhead', + **kwargs +): + if is_dist_available_and_initialized(): + rank = get_rank() + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model + if dist_mode == 'dp': + model = DP(model, device_ids=[rank], output_device=rank) + elif dist_mode == 'ddp': + model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters) + else: + raise AttributeError('') + + if compile: + model = torch.compile(model, mode=compile_mode) + + return model + +def de_model(model): + return de_parallel(de_complie(model)) + + +def warp_loader(loader, shuffle=False): + if is_dist_available_and_initialized(): + sampler = 
DistributedSampler(loader.dataset, shuffle=shuffle) + loader = DataLoader(loader.dataset, + loader.batch_size, + sampler=sampler, + drop_last=loader.drop_last, + collate_fn=loader.collate_fn, + pin_memory=loader.pin_memory, + num_workers=loader.num_workers) + return loader + + + +def is_parallel(model) -> bool: + # Returns True if model is of type DP or DDP + return type(model) in (torch.nn.parallel.DataParallel, torch.nn.parallel.DistributedDataParallel) + + +def de_parallel(model) -> nn.Module: + # De-parallelize a model: returns single-GPU model if model is of type DP or DDP + return model.module if is_parallel(model) else model + + +def reduce_dict(data, avg=True): + """ + Args + data dict: input, {k: v, ...} + avg bool: true + """ + world_size = get_world_size() + if world_size < 2: + return data + + with torch.no_grad(): + keys, values = [], [] + for k in sorted(data.keys()): + keys.append(k) + values.append(data[k]) + + values = torch.stack(values, dim=0) + torch.distributed.all_reduce(values) + + if avg is True: + values /= world_size + + return {k: v for k, v in zip(keys, values)} + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + data_list = [None] * world_size + torch.distributed.all_gather_object(data_list, data) + return data_list + + +def sync_time(): + """sync_time + """ + if torch.cuda.is_available(): + torch.cuda.synchronize() + + return time.time() + + + +def setup_seed(seed: int, deterministic=False): + """setup_seed for reproducibility + torch.manual_seed(3407) is all you need. https://arxiv.org/abs/2109.08203 + """ + seed = seed + get_rank() + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + # memory will be large when setting deterministic to True + if torch.backends.cudnn.is_available() and deterministic: + torch.backends.cudnn.deterministic = True + + +# for torch.compile +def check_compile(): + import torch + import warnings + gpu_ok = False + if torch.cuda.is_available(): + device_cap = torch.cuda.get_device_capability() + if device_cap in ((7, 0), (8, 0), (9, 0)): + gpu_ok = True + if not gpu_ok: + warnings.warn( + "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower " + "than expected." + ) + return gpu_ok + +def is_compile(model): + import torch._dynamo + return type(model) in (torch._dynamo.OptimizedModule, ) + +def de_complie(model): + return model._orig_mod if is_compile(model) else model diff --git a/deim/_engine/misc/lazy_loader.py b/deim/_engine/misc/lazy_loader.py new file mode 100644 index 00000000..e99ce599 --- /dev/null +++ b/deim/_engine/misc/lazy_loader.py @@ -0,0 +1,70 @@ +""" +https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py +""" + + +import types +import importlib + +class LazyLoader(types.ModuleType): + """Lazily import a module, mainly to avoid pulling in large dependencies. + + `paddle`, and `ffmpeg` are examples of modules that are large and not always + needed, and this allows them to only be loaded when they are used. + """ + + # The lint error here is incorrect. 
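+    # How the laziness works: the real import is deferred until the first attribute access,
+    # when __getattr__ calls _load(); _load() imports the module via importlib, stores it in
+    # the parent's globals under `local_name`, and copies its __dict__ onto this proxy.
+    # Illustrative use (hypothetical module name):  np = LazyLoader("np", globals(), "numpy")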
+ def __init__(self, local_name, parent_module_globals, name, warning=None): + self._local_name = local_name + self._parent_module_globals = parent_module_globals + self._warning = warning + + # These members allows doctest correctly process this module member without + # triggering self._load(). self._load() mutates parant_module_globals and + # triggers a dict mutated during iteration error from doctest.py. + # - for from_module() + self.__module__ = name.rsplit(".", 1)[0] + # - for is_routine() + self.__wrapped__ = None + + super(LazyLoader, self).__init__(name) + + def _load(self): + """Load the module and insert it into the parent's globals.""" + # Import the target module and insert it into the parent's namespace + module = importlib.import_module(self.__name__) + self._parent_module_globals[self._local_name] = module + + # Emit a warning if one was specified + if self._warning: + # logging.warning(self._warning) + # Make sure to only warn once. + self._warning = None + + # Update this object's dict so that if someone keeps a reference to the + # LazyLoader, lookups are efficient (__getattr__ is only called on lookups + # that fail). + self.__dict__.update(module.__dict__) + + return module + + def __getattr__(self, item): + module = self._load() + return getattr(module, item) + + def __repr__(self): + # Carefully to not trigger _load, since repr may be called in very + # sensitive places. + return f"" + + def __dir__(self): + module = self._load() + return dir(module) + + +# import paddle.nn as nn +# nn = LazyLoader("nn", globals(), "paddle.nn") + +# class M(nn.Layer): +# def __init__(self) -> None: +# super().__init__() diff --git a/deim/_engine/misc/logger.py b/deim/_engine/misc/logger.py new file mode 100644 index 00000000..fd020905 --- /dev/null +++ b/deim/_engine/misc/logger.py @@ -0,0 +1,238 @@ +""" +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +https://github.com/facebookresearch/detr/blob/main/util/misc.py +Mostly copy-paste from torchvision references. +""" + +import time +import pickle +import datetime +from collections import defaultdict, deque +from typing import Dict + +import torch +import torch.distributed as tdist + +from .dist_utils import is_dist_available_and_initialized, get_world_size + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
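+        Only `count` and `total` are all-reduced across ranks, so `global_avg` becomes
+        consistent between processes, while the window-based `median`, `avg` and `max`
+        remain local to each rank.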
+ """ + if not is_dist_available_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + tdist.barrier() + tdist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="cuda") + size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] + tdist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") + tensor = torch.cat((tensor, padding), dim=0) + tdist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]: + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + tdist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}' + ]) + else: + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) diff --git a/deim/_engine/misc/profiler_utils.py b/deim/_engine/misc/profiler_utils.py new file mode 100644 index 00000000..f328cc35 --- /dev/null +++ b/deim/_engine/misc/profiler_utils.py @@ -0,0 +1,26 @@ +""" +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
+""" + +import copy +from calflops import calculate_flops +from typing import Tuple + +def stats( + cfg, + input_shape: Tuple=(1, 3, 640, 640), ) -> Tuple[int, dict]: + + base_size = cfg.train_dataloader.collate_fn.base_size + input_shape = (1, 3, base_size, base_size) + + model_for_info = copy.deepcopy(cfg.model).deploy() + + flops, macs, _ = calculate_flops(model=model_for_info, + input_shape=input_shape, + output_as_string=True, + output_precision=4, + print_detailed=False) + params = sum(p.numel() for p in model_for_info.parameters()) + del model_for_info + + return params, {"Model FLOPs:%s MACs:%s Params:%s" %(flops, macs, params)} diff --git a/deim/_engine/misc/visualizer.py b/deim/_engine/misc/visualizer.py new file mode 100644 index 00000000..4e14eef9 --- /dev/null +++ b/deim/_engine/misc/visualizer.py @@ -0,0 +1,33 @@ +"""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import PIL +import torch +import torch.utils.data +import torchvision +torchvision.disable_beta_transforms_warning() + +__all__ = ['show_sample'] + +def show_sample(sample): + """for coco dataset/dataloader + """ + import matplotlib.pyplot as plt + from torchvision.transforms.v2 import functional as F + from torchvision.utils import draw_bounding_boxes + + image, target = sample + if isinstance(image, PIL.Image.Image): + image = F.to_image_tensor(image) + + image = F.convert_dtype(image, torch.uint8) + annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3) + + fig, ax = plt.subplots() + ax.imshow(annotated_image.permute(1, 2, 0).numpy()) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + fig.tight_layout() + fig.show() + plt.show() diff --git a/tools/inference/openvino_inf.py b/deim/_engine/optim/__init__.py similarity index 58% rename from tools/inference/openvino_inf.py rename to deim/_engine/optim/__init__.py index 4a66755a..a6b3bc6a 100644 --- a/tools/inference/openvino_inf.py +++ b/deim/_engine/optim/__init__.py @@ -3,5 +3,7 @@ Copyright(c) 2023 lyuwenyu. All Rights Reserved. """ - -# please reference: https://github.com/guojin-yan/RT-DETR-OpenVINO +from .ema import * +from .optim import * +from .amp import * +from .warmup import * diff --git a/deim/_engine/optim/amp.py b/deim/_engine/optim/amp.py new file mode 100644 index 00000000..6af85e5c --- /dev/null +++ b/deim/_engine/optim/amp.py @@ -0,0 +1,14 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +import torch.cuda.amp as amp + +from ..core import register + + +__all__ = ['GradScaler'] + +GradScaler = register()(amp.grad_scaler.GradScaler) diff --git a/deim/_engine/optim/ema.py b/deim/_engine/optim/ema.py new file mode 100644 index 00000000..1c234347 --- /dev/null +++ b/deim/_engine/optim/ema.py @@ -0,0 +1,102 @@ +""" +D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement +Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
+""" + + +import torch +import torch.nn as nn + +import math +from copy import deepcopy + +from ..core import register +from ..misc import dist_utils + +__all__ = ['ModelEMA'] + + +@register() +class ModelEMA(object): + """ + Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=1000, start: int=0): + super().__init__() + + self.module = deepcopy(dist_utils.de_parallel(model)).eval() + # if next(model.parameters()).device.type != 'cpu': + # self.module.half() # FP16 EMA + + self.decay = decay + self.warmups = warmups + self.before_start = 0 + self.start = start + self.updates = 0 # number of EMA updates + if warmups == 0: + self.decay_fn = lambda x: decay + else: + self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups)) # decay exponential ramp (to help early epochs) + + for p in self.module.parameters(): + p.requires_grad_(False) + + + def update(self, model: nn.Module): + if self.before_start < self.start: + self.before_start += 1 + return + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay_fn(self.updates) + msd = dist_utils.de_parallel(model).state_dict() + for k, v in self.module.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + def to(self, *args, **kwargs): + self.module = self.module.to(*args, **kwargs) + return self + + def state_dict(self, ): + return dict(module=self.module.state_dict(), updates=self.updates) + + def load_state_dict(self, state, strict=True): + self.module.load_state_dict(state['module'], strict=strict) + if 'updates' in state: + self.updates = state['updates'] + + def forwad(self, ): + raise RuntimeError('ema...') + + def extra_repr(self) -> str: + return f'decay={self.decay}, warmups={self.warmups}' + + + +class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): + """Maintains moving averages of model parameters using an exponential decay. + ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` + `torch.optim.swa_utils.AveragedModel `_ + is used to compute the EMA. + """ + def __init__(self, model, decay, device="cpu", use_buffers=True): + + self.decay_fn = lambda x: decay * (1 - math.exp(-x / 2000)) + + def ema_avg(avg_model_param, model_param, num_averaged): + decay = self.decay_fn(num_averaged) + return decay * avg_model_param + (1 - decay) * model_param + + super().__init__(model, device, ema_avg, use_buffers=use_buffers) diff --git a/deim/_engine/optim/lr_scheduler.py b/deim/_engine/optim/lr_scheduler.py new file mode 100644 index 00000000..b5902b34 --- /dev/null +++ b/deim/_engine/optim/lr_scheduler.py @@ -0,0 +1,73 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +""" + +import math +from functools import partial + + +def flat_cosine_schedule(total_iter, warmup_iter, flat_iter, no_aug_iter, current_iter, init_lr, min_lr): + """ + Computes the learning rate using a warm-up, flat, and cosine decay schedule. 
+ + Args: + total_iter (int): Total number of iterations. + warmup_iter (int): Number of iterations for warm-up phase. + flat_iter (int): Number of iterations for flat phase. + no_aug_iter (int): Number of iterations for no-augmentation phase. + current_iter (int): Current iteration. + init_lr (float): Initial learning rate. + min_lr (float): Minimum learning rate. + + Returns: + float: Calculated learning rate. + """ + if current_iter <= warmup_iter: + return init_lr * (current_iter / float(warmup_iter)) ** 2 + elif warmup_iter < current_iter <= flat_iter: + return init_lr + elif current_iter >= total_iter - no_aug_iter: + return min_lr + else: + cosine_decay = 0.5 * (1 + math.cos(math.pi * (current_iter - flat_iter) / + (total_iter - flat_iter - no_aug_iter))) + return min_lr + (init_lr - min_lr) * cosine_decay + + +class FlatCosineLRScheduler: + """ + Learning rate scheduler with warm-up, optional flat phase, and cosine decay following RTMDet. + + Args: + optimizer (torch.optim.Optimizer): Optimizer instance. + lr_gamma (float): Scaling factor for the minimum learning rate. + iter_per_epoch (int): Number of iterations per epoch. + total_epochs (int): Total number of training epochs. + warmup_epochs (int): Number of warm-up epochs. + flat_epochs (int): Number of flat epochs (for flat-cosine scheduler). + no_aug_epochs (int): Number of no-augmentation epochs. + """ + def __init__(self, optimizer, lr_gamma, iter_per_epoch, total_epochs, + warmup_iter, flat_epochs, no_aug_epochs, scheduler_type="cosine"): + self.base_lrs = [group["initial_lr"] for group in optimizer.param_groups] + self.min_lrs = [base_lr * lr_gamma for base_lr in self.base_lrs] + + total_iter = int(iter_per_epoch * total_epochs) + no_aug_iter = int(iter_per_epoch * no_aug_epochs) + flat_iter = int(iter_per_epoch * flat_epochs) + + print(self.base_lrs, self.min_lrs, total_iter, warmup_iter, flat_iter, no_aug_iter) + self.lr_func = partial(flat_cosine_schedule, total_iter, warmup_iter, flat_iter, no_aug_iter) + + def step(self, current_iter, optimizer): + """ + Updates the learning rate of the optimizer at the current iteration. + + Args: + current_iter (int): Current iteration. + optimizer (torch.optim.Optimizer): Optimizer instance. + """ + for i, group in enumerate(optimizer.param_groups): + group["lr"] = self.lr_func(current_iter, self.base_lrs[i], self.min_lrs[i]) + return optimizer diff --git a/deim/_engine/optim/optim.py b/deim/_engine/optim/optim.py new file mode 100644 index 00000000..f4830c66 --- /dev/null +++ b/deim/_engine/optim/optim.py @@ -0,0 +1,25 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler + +from ..core import register + + +__all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR'] + + + +SGD = register()(optim.SGD) +Adam = register()(optim.Adam) +AdamW = register()(optim.AdamW) + + +MultiStepLR = register()(lr_scheduler.MultiStepLR) +CosineAnnealingLR = register()(lr_scheduler.CosineAnnealingLR) +OneCycleLR = register()(lr_scheduler.OneCycleLR) +LambdaLR = register()(lr_scheduler.LambdaLR) diff --git a/deim/_engine/optim/warmup.py b/deim/_engine/optim/warmup.py new file mode 100644 index 00000000..86e319b5 --- /dev/null +++ b/deim/_engine/optim/warmup.py @@ -0,0 +1,48 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +from torch.optim.lr_scheduler import LRScheduler + +from ..core import register + + +class Warmup(object): + def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None: + self.lr_scheduler = lr_scheduler + self.warmup_end_values = [pg['lr'] for pg in lr_scheduler.optimizer.param_groups] + self.last_step = last_step + self.warmup_duration = warmup_duration + self.step() + + def state_dict(self): + return {k: v for k, v in self.__dict__.items() if k != 'lr_scheduler'} + + def load_state_dict(self, state_dict): + self.__dict__.update(state_dict) + + def get_warmup_factor(self, step, **kwargs): + raise NotImplementedError + + def step(self, ): + self.last_step += 1 + if self.last_step >= self.warmup_duration: + return + factor = self.get_warmup_factor(self.last_step) + for i, pg in enumerate(self.lr_scheduler.optimizer.param_groups): + pg['lr'] = factor * self.warmup_end_values[i] + + def finished(self, ): + if self.last_step >= self.warmup_duration: + return True + return False + + +@register() +class LinearWarmup(Warmup): + def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None: + super().__init__(lr_scheduler, warmup_duration, last_step) + + def get_warmup_factor(self, step): + return min(1.0, (step + 1) / self.warmup_duration) diff --git a/deim/_engine/solver/__init__.py b/deim/_engine/solver/__init__.py new file mode 100644 index 00000000..a6a56c9f --- /dev/null +++ b/deim/_engine/solver/__init__.py @@ -0,0 +1,17 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +from ._solver import BaseSolver +from .clas_solver import ClasSolver +from .det_solver import DetSolver + + + +from typing import Dict + +TASKS :Dict[str, BaseSolver] = { + 'classification': ClasSolver, + 'detection': DetSolver, +} diff --git a/deim/_engine/solver/_solver.py b/deim/_engine/solver/_solver.py new file mode 100644 index 00000000..925e45f0 --- /dev/null +++ b/deim/_engine/solver/_solver.py @@ -0,0 +1,696 @@ +import torch +import torch.nn as nn + +from datetime import datetime +from pathlib import Path +from typing import Dict +import atexit + +from ..misc import dist_utils +from ..core import BaseConfig + + +def to(m: nn.Module, device: str): + if m is None: + return None + return m.to(device) + + +def remove_module_prefix(state_dict): + new_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('module.'): + new_state_dict[k[7:]] = v + else: + new_state_dict[k] = v + return new_state_dict + + +class BaseSolver(object): + def __init__(self, cfg: BaseConfig) -> None: + self.cfg = cfg + self.obj365_ids = [ + 0, 46, 5, 58, 114, 55, 116, 65, 21, 40, 176, 127, 249, 24, 56, 139, 92, 78, 99, 96, + 144, 295, 178, 180, 38, 39, 13, 43, 120, 219, 148, 173, 165, 154, 137, 113, 145, 146, + 204, 8, 35, 10, 88, 84, 93, 26, 112, 82, 265, 104, 141, 152, 234, 143, 150, 97, 2, + 50, 25, 75, 98, 153, 37, 73, 115, 132, 106, 61, 163, 134, 277, 81, 133, 18, 94, 30, + 169, 70, 328, 226 + ] + def _setup(self): + """Avoid instantiating unnecessary classes""" + cfg = self.cfg + if cfg.device: + device = torch.device(cfg.device) + else: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + self.model = cfg.model + + # NOTE: Must load_tuning_state before EMA instance building + if self.cfg.tuning: + print(f'Tuning checkpoint from {self.cfg.tuning}') + self.load_tuning_state(self.cfg.tuning) + + self.model = dist_utils.warp_model( + self.model.to(device), 
sync_bn=cfg.sync_bn, find_unused_parameters=cfg.find_unused_parameters + ) + + self.criterion = self.to(cfg.criterion, device) + self.postprocessor = self.to(cfg.postprocessor, device) + + self.ema = self.to(cfg.ema, device) + self.scaler = cfg.scaler + + self.device = device + self.last_epoch = self.cfg.last_epoch + + self.output_dir = Path(cfg.output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + self.writer = cfg.writer + + if self.writer: + atexit.register(self.writer.close) + if dist_utils.is_main_process(): + self.writer.add_text('config', '{:s}'.format(cfg.__repr__()), 0) + + def cleanup(self): + if self.writer: + atexit.register(self.writer.close) + + def train(self): + self._setup() + self.optimizer = self.cfg.optimizer + self.lr_scheduler = self.cfg.lr_scheduler + self.lr_warmup_scheduler = self.cfg.lr_warmup_scheduler + + self.train_dataloader = dist_utils.warp_loader( + self.cfg.train_dataloader, shuffle=self.cfg.train_dataloader.shuffle + ) + self.val_dataloader = dist_utils.warp_loader( + self.cfg.val_dataloader, shuffle=self.cfg.val_dataloader.shuffle + ) + + self.evaluator = self.cfg.evaluator + + # NOTE: Instantiating order + if self.cfg.resume: + print(f'Resume checkpoint from {self.cfg.resume}') + self.load_resume_state(self.cfg.resume) + + def eval(self): + self._setup() + + self.val_dataloader = dist_utils.warp_loader( + self.cfg.val_dataloader, shuffle=self.cfg.val_dataloader.shuffle + ) + + self.evaluator = self.cfg.evaluator + + if self.cfg.resume: + print(f'Resume checkpoint from {self.cfg.resume}') + self.load_resume_state(self.cfg.resume) + + def to(self, module, device): + return module.to(device) if hasattr(module, 'to') else module + + def state_dict(self): + """State dict, train/eval""" + state = {} + state['date'] = datetime.now().isoformat() + + # For resume + state['last_epoch'] = self.last_epoch + + for k, v in self.__dict__.items(): + if hasattr(v, 'state_dict'): + v = dist_utils.de_parallel(v) + state[k] = v.state_dict() + + return state + + def load_state_dict(self, state): + """Load state dict, train/eval""" + if 'last_epoch' in state: + self.last_epoch = state['last_epoch'] + print('Load last_epoch') + + for k, v in self.__dict__.items(): + if hasattr(v, 'load_state_dict') and k in state: + v = dist_utils.de_parallel(v) + v.load_state_dict(state[k]) + print(f'Load {k}.state_dict') + + if hasattr(v, 'load_state_dict') and k not in state: + if k == 'ema': + model = getattr(self, 'model', None) + if model is not None: + ema = dist_utils.de_parallel(v) + model_state_dict = remove_module_prefix(model.state_dict()) + ema.load_state_dict({'module': model_state_dict}) + print(f'Load {k}.state_dict from model.state_dict') + else: + print(f'Not load {k}.state_dict') + + def load_resume_state(self, path: str): + """Load resume""" + if path.startswith('http'): + state = torch.hub.load_state_dict_from_url(path, map_location='cpu') + else: + state = torch.load(path, map_location='cpu') + + # state['model'] = remove_module_prefix(state['model']) + self.load_state_dict(state) + + def load_tuning_state(self, path: str): + """Load model for tuning and adjust mismatched head parameters""" + if path.startswith('http'): + state = torch.hub.load_state_dict_from_url(path, map_location='cpu') + else: + state = torch.load(path, map_location='cpu') + + module = dist_utils.de_parallel(self.model) + + # Load the appropriate state dict + if 'ema' in state: + pretrain_state_dict = state['ema']['module'] + else: + pretrain_state_dict = state['model'] + + # Adjust head 
parameters between datasets + try: + adjusted_state_dict = self._adjust_head_parameters(module.state_dict(), pretrain_state_dict) + stat, infos = self._matched_state(module.state_dict(), adjusted_state_dict) + except Exception: + stat, infos = self._matched_state(module.state_dict(), pretrain_state_dict) + + module.load_state_dict(stat, strict=False) + print(f'Load model.state_dict, {infos}') + + @staticmethod + def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]): + missed_list = [] + unmatched_list = [] + matched_state = {} + for k, v in state.items(): + if k in params: + if v.shape == params[k].shape: + matched_state[k] = params[k] + else: + unmatched_list.append(k) + else: + missed_list.append(k) + + return matched_state, {'missed': missed_list, 'unmatched': unmatched_list} + + def _adjust_head_parameters(self, cur_state_dict, pretrain_state_dict): + """Adjust head parameters between datasets.""" + # List of parameters to adjust + if pretrain_state_dict['decoder.denoising_class_embed.weight'].size() != \ + cur_state_dict['decoder.denoising_class_embed.weight'].size(): + del pretrain_state_dict['decoder.denoising_class_embed.weight'] + + head_param_names = [ + 'decoder.enc_score_head.weight', + 'decoder.enc_score_head.bias' + ] + for i in range(8): + head_param_names.append(f'decoder.dec_score_head.{i}.weight') + head_param_names.append(f'decoder.dec_score_head.{i}.bias') + + adjusted_params = [] + + for param_name in head_param_names: + if param_name in cur_state_dict and param_name in pretrain_state_dict: + cur_tensor = cur_state_dict[param_name] + pretrain_tensor = pretrain_state_dict[param_name] + adjusted_tensor = self.map_class_weights(cur_tensor, pretrain_tensor) + if adjusted_tensor is not None: + pretrain_state_dict[param_name] = adjusted_tensor + adjusted_params.append(param_name) + else: + print(f"Cannot adjust parameter '{param_name}' due to size mismatch.") + + return pretrain_state_dict + + def map_class_weights(self, cur_tensor, pretrain_tensor): + """Map class weights from pretrain model to current model based on class IDs.""" + if pretrain_tensor.size() == cur_tensor.size(): + return pretrain_tensor + + adjusted_tensor = cur_tensor.clone() + adjusted_tensor.requires_grad = False + + if pretrain_tensor.size() > cur_tensor.size(): + for coco_id, obj_id in enumerate(self.obj365_ids): + adjusted_tensor[coco_id] = pretrain_tensor[obj_id+1] + else: + for coco_id, obj_id in enumerate(self.obj365_ids): + adjusted_tensor[obj_id+1] = pretrain_tensor[coco_id] + + return adjusted_tensor + + def fit(self): + raise NotImplementedError('') + + def val(self): + raise NotImplementedError('') + +# obj365_classes = [ +# 'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp', 'Glasses', +# 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf', 'Handbag/Satchel', +# 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet', 'Book', 'Gloves', 'Storage box', +# 'Boat', 'Leather Shoes', 'Flower', 'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', +# 'Pillow', 'Boots', 'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', +# 'Belt', 'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker', 'Watch', +# 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool', 'Barrel/bucket', 'Van', +# 'Couch', 'Sandals', 'Bakset', 'Drum', 'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', +# 'Motorcycle', 'Guitar', 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck', +# 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy', 'Candle', 'Sailboat', +# 
'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent', 'Horse', 'Mirror', 'Power outlet', +# 'Sink', 'Apple', 'Air Conditioner', 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', +# 'Fork', 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', 'Pot', 'Cow', +# 'Cake', 'Dinning Table', 'Sheep', 'Hanger', 'Blackboard/Whiteboard', 'Napkin', +# 'Other Fish', 'Orange/Tangerine', 'Toiletry', 'Keyboard', 'Tomato', 'Lantern', +# 'Machinery Vehicle', 'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', +# 'Airplane', 'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage', 'Nightstand', +# 'Tea pot', 'Telephone', 'Trolley', 'Head Phone', 'Sports Car', 'Stop Sign', 'Dessert', +# 'Scooter', 'Stroller', 'Crane', 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', +# 'Baseball Bat', 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza', +# 'Elephant', 'Skateboard', 'Surfboard', 'Gun', 'Skating and Skiing shoes', 'Gas stove', +# 'Donut', 'Bow Tie', 'Carrot', 'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', +# 'Pepper', 'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks', 'Microwave', +# 'Pigeon', 'Baseball', 'Cutting/chopping Board', 'Coffee Table', 'Side Table', 'Scissors', +# 'Marker', 'Pie', 'Ladder', 'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball', +# 'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin', 'Egg', +# 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards', 'Converter', 'Bathtub', +# 'Wheelchair', 'Golf Club', 'Briefcase', 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', +# 'Pear', 'Heavy Truck', 'Hamburger', 'Extractor', 'Extention Cord', 'Tong', +# 'Tennis Racket', 'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', +# 'Tennis', 'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion', +# 'Green beans', 'Projector', 'Frisbee', 'Washing Machine/Drying Machine', 'Chicken', +# 'Printer', 'Watermelon', 'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', +# 'Hotair ballon', 'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog', +# 'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer', 'Goose', 'Tape', +# 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple', 'Golf Ball', 'Ambulance', 'Parking meter', +# 'Mango', 'Key', 'Hurdle', 'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', +# 'Megaphone', 'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion', +# 'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom', 'Trombone', +# 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit', 'Router/modem', 'Poker Card', 'Toaster', +# 'Shrimp', 'Sushi', 'Cheese', 'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', +# 'Cue', 'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap', 'Recorder', +# 'Bear', 'Eggplant', 'Board Eraser', 'Coconut', 'Tape Measur/ Ruler', 'Pig', +# 'Showerhead', 'Globe', 'Chips', 'Steak', 'Crosswalk Sign', 'Stapler', 'Campel', +# 'Formula 1 ', 'Pomegranate', 'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', +# 'Rice Cooker', 'Tuba', 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', +# 'Buttefly', 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill', +# 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter', 'Grapefruit', +# 'Game board', 'Mop', 'Radish', 'Baozi', 'Target', 'French', 'Spring Rolls', 'Monkey', +# 'Rabbit', 'Pencil Case', 'Yak', 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', +# 'Scallop', 'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle', +# 'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 
'Eraser', 'Lobster', 'Durian', 'Okra', +# 'Lipstick', 'Cosmetics Mirror', 'Curling', 'Table Tennis ' +# ] + +# coco_classes = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', +# 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', +# 'stop sign', 'parking meter', 'bench', 'wild bird', 'cat', 'dog', +# 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', +# 'backpack', 'umbrella', 'handbag/satchel', 'tie', 'luggage', 'frisbee', +# 'skating and skiing shoes', 'snowboard', 'baseball', 'kite', 'baseball bat', +# 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', +# 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl/basin', +# 'banana', 'apple', 'sandwich', 'orange/tangerine', 'broccoli', 'carrot', +# 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', +# 'potted plant', 'bed', 'dinning table', 'toilet', 'moniter/tv', 'laptop', +# 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', +# 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', +# 'vase', 'scissors', 'stuffed toy', 'hair dryer', 'toothbrush'] + + +# obj365_classes = [ +# (0, 'Person'), +# (1, 'Sneakers'), +# (2, 'Chair'), +# (3, 'Other Shoes'), +# (4, 'Hat'), +# (5, 'Car'), +# (6, 'Lamp'), +# (7, 'Glasses'), +# (8, 'Bottle'), +# (9, 'Desk'), +# (10, 'Cup'), +# (11, 'Street Lights'), +# (12, 'Cabinet/shelf'), +# (13, 'Handbag/Satchel'), +# (14, 'Bracelet'), +# (15, 'Plate'), +# (16, 'Picture/Frame'), +# (17, 'Helmet'), +# (18, 'Book'), +# (19, 'Gloves'), +# (20, 'Storage box'), +# (21, 'Boat'), +# (22, 'Leather Shoes'), +# (23, 'Flower'), +# (24, 'Bench'), +# (25, 'Potted Plant'), +# (26, 'Bowl/Basin'), +# (27, 'Flag'), +# (28, 'Pillow'), +# (29, 'Boots'), +# (30, 'Vase'), +# (31, 'Microphone'), +# (32, 'Necklace'), +# (33, 'Ring'), +# (34, 'SUV'), +# (35, 'Wine Glass'), +# (36, 'Belt'), +# (37, 'Monitor/TV'), +# (38, 'Backpack'), +# (39, 'Umbrella'), +# (40, 'Traffic Light'), +# (41, 'Speaker'), +# (42, 'Watch'), +# (43, 'Tie'), +# (44, 'Trash bin Can'), +# (45, 'Slippers'), +# (46, 'Bicycle'), +# (47, 'Stool'), +# (48, 'Barrel/bucket'), +# (49, 'Van'), +# (50, 'Couch'), +# (51, 'Sandals'), +# (52, 'Basket'), +# (53, 'Drum'), +# (54, 'Pen/Pencil'), +# (55, 'Bus'), +# (56, 'Wild Bird'), +# (57, 'High Heels'), +# (58, 'Motorcycle'), +# (59, 'Guitar'), +# (60, 'Carpet'), +# (61, 'Cell Phone'), +# (62, 'Bread'), +# (63, 'Camera'), +# (64, 'Canned'), +# (65, 'Truck'), +# (66, 'Traffic cone'), +# (67, 'Cymbal'), +# (68, 'Lifesaver'), +# (69, 'Towel'), +# (70, 'Stuffed Toy'), +# (71, 'Candle'), +# (72, 'Sailboat'), +# (73, 'Laptop'), +# (74, 'Awning'), +# (75, 'Bed'), +# (76, 'Faucet'), +# (77, 'Tent'), +# (78, 'Horse'), +# (79, 'Mirror'), +# (80, 'Power outlet'), +# (81, 'Sink'), +# (82, 'Apple'), +# (83, 'Air Conditioner'), +# (84, 'Knife'), +# (85, 'Hockey Stick'), +# (86, 'Paddle'), +# (87, 'Pickup Truck'), +# (88, 'Fork'), +# (89, 'Traffic Sign'), +# (90, 'Balloon'), +# (91, 'Tripod'), +# (92, 'Dog'), +# (93, 'Spoon'), +# (94, 'Clock'), +# (95, 'Pot'), +# (96, 'Cow'), +# (97, 'Cake'), +# (98, 'Dining Table'), +# (99, 'Sheep'), +# (100, 'Hanger'), +# (101, 'Blackboard/Whiteboard'), +# (102, 'Napkin'), +# (103, 'Other Fish'), +# (104, 'Orange/Tangerine'), +# (105, 'Toiletry'), +# (106, 'Keyboard'), +# (107, 'Tomato'), +# (108, 'Lantern'), +# (109, 'Machinery Vehicle'), +# (110, 'Fan'), +# (111, 'Green Vegetables'), +# (112, 'Banana'), +# (113, 'Baseball Glove'), +# (114, 'Airplane'), +# (115, 'Mouse'), +# (116, 'Train'), +# (117, 'Pumpkin'), +# (118, 'Soccer'), +# (119, 
'Skiboard'), +# (120, 'Luggage'), +# (121, 'Nightstand'), +# (122, 'Tea pot'), +# (123, 'Telephone'), +# (124, 'Trolley'), +# (125, 'Head Phone'), +# (126, 'Sports Car'), +# (127, 'Stop Sign'), +# (128, 'Dessert'), +# (129, 'Scooter'), +# (130, 'Stroller'), +# (131, 'Crane'), +# (132, 'Remote'), +# (133, 'Refrigerator'), +# (134, 'Oven'), +# (135, 'Lemon'), +# (136, 'Duck'), +# (137, 'Baseball Bat'), +# (138, 'Surveillance Camera'), +# (139, 'Cat'), +# (140, 'Jug'), +# (141, 'Broccoli'), +# (142, 'Piano'), +# (143, 'Pizza'), +# (144, 'Elephant'), +# (145, 'Skateboard'), +# (146, 'Surfboard'), +# (147, 'Gun'), +# (148, 'Skating and Skiing Shoes'), +# (149, 'Gas Stove'), +# (150, 'Donut'), +# (151, 'Bow Tie'), +# (152, 'Carrot'), +# (153, 'Toilet'), +# (154, 'Kite'), +# (155, 'Strawberry'), +# (156, 'Other Balls'), +# (157, 'Shovel'), +# (158, 'Pepper'), +# (159, 'Computer Box'), +# (160, 'Toilet Paper'), +# (161, 'Cleaning Products'), +# (162, 'Chopsticks'), +# (163, 'Microwave'), +# (164, 'Pigeon'), +# (165, 'Baseball'), +# (166, 'Cutting/chopping Board'), +# (167, 'Coffee Table'), +# (168, 'Side Table'), +# (169, 'Scissors'), +# (170, 'Marker'), +# (171, 'Pie'), +# (172, 'Ladder'), +# (173, 'Snowboard'), +# (174, 'Cookies'), +# (175, 'Radiator'), +# (176, 'Fire Hydrant'), +# (177, 'Basketball'), +# (178, 'Zebra'), +# (179, 'Grape'), +# (180, 'Giraffe'), +# (181, 'Potato'), +# (182, 'Sausage'), +# (183, 'Tricycle'), +# (184, 'Violin'), +# (185, 'Egg'), +# (186, 'Fire Extinguisher'), +# (187, 'Candy'), +# (188, 'Fire Truck'), +# (189, 'Billiards'), +# (190, 'Converter'), +# (191, 'Bathtub'), +# (192, 'Wheelchair'), +# (193, 'Golf Club'), +# (194, 'Briefcase'), +# (195, 'Cucumber'), +# (196, 'Cigar/Cigarette'), +# (197, 'Paint Brush'), +# (198, 'Pear'), +# (199, 'Heavy Truck'), +# (200, 'Hamburger'), +# (201, 'Extractor'), +# (202, 'Extension Cord'), +# (203, 'Tong'), +# (204, 'Tennis Racket'), +# (205, 'Folder'), +# (206, 'American Football'), +# (207, 'Earphone'), +# (208, 'Mask'), +# (209, 'Kettle'), +# (210, 'Tennis'), +# (211, 'Ship'), +# (212, 'Swing'), +# (213, 'Coffee Machine'), +# (214, 'Slide'), +# (215, 'Carriage'), +# (216, 'Onion'), +# (217, 'Green Beans'), +# (218, 'Projector'), +# (219, 'Frisbee'), +# (220, 'Washing Machine/Drying Machine'), +# (221, 'Chicken'), +# (222, 'Printer'), +# (223, 'Watermelon'), +# (224, 'Saxophone'), +# (225, 'Tissue'), +# (226, 'Toothbrush'), +# (227, 'Ice Cream'), +# (228, 'Hot Air Balloon'), +# (229, 'Cello'), +# (230, 'French Fries'), +# (231, 'Scale'), +# (232, 'Trophy'), +# (233, 'Cabbage'), +# (234, 'Hot Dog'), +# (235, 'Blender'), +# (236, 'Peach'), +# (237, 'Rice'), +# (238, 'Wallet/Purse'), +# (239, 'Volleyball'), +# (240, 'Deer'), +# (241, 'Goose'), +# (242, 'Tape'), +# (243, 'Tablet'), +# (244, 'Cosmetics'), +# (245, 'Trumpet'), +# (246, 'Pineapple'), +# (247, 'Golf Ball'), +# (248, 'Ambulance'), +# (249, 'Parking Meter'), +# (250, 'Mango'), +# (251, 'Key'), +# (252, 'Hurdle'), +# (253, 'Fishing Rod'), +# (254, 'Medal'), +# (255, 'Flute'), +# (256, 'Brush'), +# (257, 'Penguin'), +# (258, 'Megaphone'), +# (259, 'Corn'), +# (260, 'Lettuce'), +# (261, 'Garlic'), +# (262, 'Swan'), +# (263, 'Helicopter'), +# (264, 'Green Onion'), +# (265, 'Sandwich'), +# (266, 'Nuts'), +# (267, 'Speed Limit Sign'), +# (268, 'Induction Cooker'), +# (269, 'Broom'), +# (270, 'Trombone'), +# (271, 'Plum'), +# (272, 'Rickshaw'), +# (273, 'Goldfish'), +# (274, 'Kiwi Fruit'), +# (275, 'Router/Modem'), +# (276, 'Poker Card'), +# (277, 'Toaster'), +# (278, 'Shrimp'), 
+# (279, 'Sushi'), +# (280, 'Cheese'), +# (281, 'Notepaper'), +# (282, 'Cherry'), +# (283, 'Pliers'), +# (284, 'CD'), +# (285, 'Pasta'), +# (286, 'Hammer'), +# (287, 'Cue'), +# (288, 'Avocado'), +# (289, 'Hami Melon'), +# (290, 'Flask'), +# (291, 'Mushroom'), +# (292, 'Screwdriver'), +# (293, 'Soap'), +# (294, 'Recorder'), +# (295, 'Bear'), +# (296, 'Eggplant'), +# (297, 'Board Eraser'), +# (298, 'Coconut'), +# (299, 'Tape Measure/Ruler'), +# (300, 'Pig'), +# (301, 'Showerhead'), +# (302, 'Globe'), +# (303, 'Chips'), +# (304, 'Steak'), +# (305, 'Crosswalk Sign'), +# (306, 'Stapler'), +# (307, 'Camel'), +# (308, 'Formula 1'), +# (309, 'Pomegranate'), +# (310, 'Dishwasher'), +# (311, 'Crab'), +# (312, 'Hoverboard'), +# (313, 'Meatball'), +# (314, 'Rice Cooker'), +# (315, 'Tuba'), +# (316, 'Calculator'), +# (317, 'Papaya'), +# (318, 'Antelope'), +# (319, 'Parrot'), +# (320, 'Seal'), +# (321, 'Butterfly'), +# (322, 'Dumbbell'), +# (323, 'Donkey'), +# (324, 'Lion'), +# (325, 'Urinal'), +# (326, 'Dolphin'), +# (327, 'Electric Drill'), +# (328, 'Hair Dryer'), +# (329, 'Egg Tart'), +# (330, 'Jellyfish'), +# (331, 'Treadmill'), +# (332, 'Lighter'), +# (333, 'Grapefruit'), +# (334, 'Game Board'), +# (335, 'Mop'), +# (336, 'Radish'), +# (337, 'Baozi'), +# (338, 'Target'), +# (339, 'French'), +# (340, 'Spring Rolls'), +# (341, 'Monkey'), +# (342, 'Rabbit'), +# (343, 'Pencil Case'), +# (344, 'Yak'), +# (345, 'Red Cabbage'), +# (346, 'Binoculars'), +# (347, 'Asparagus'), +# (348, 'Barbell'), +# (349, 'Scallop'), +# (350, 'Noodles'), +# (351, 'Comb'), +# (352, 'Dumpling'), +# (353, 'Oyster'), +# (354, 'Table Tennis Paddle'), +# (355, 'Cosmetics Brush/Eyeliner Pencil'), +# (356, 'Chainsaw'), +# (357, 'Eraser'), +# (358, 'Lobster'), +# (359, 'Durian'), +# (360, 'Okra'), +# (361, 'Lipstick'), +# (362, 'Cosmetics Mirror'), +# (363, 'Curling'), +# (364, 'Table Tennis') +# ] diff --git a/deim/_engine/solver/clas_engine.py b/deim/_engine/solver/clas_engine.py new file mode 100644 index 00000000..dee29b57 --- /dev/null +++ b/deim/_engine/solver/clas_engine.py @@ -0,0 +1,74 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
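+
+Classification engine: defines the train_one_epoch() and evaluate() loops that
+ClasSolver drives during classification training.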
+""" + +import torch +import torch.nn as nn + +from ..misc import (MetricLogger, SmoothedValue, reduce_dict) + + +def train_one_epoch(model: nn.Module, criterion: nn.Module, dataloader, optimizer, ema, epoch, device): + """ + """ + model.train() + + metric_logger = MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) + print_freq = 100 + header = 'Epoch: [{}]'.format(epoch) + + for imgs, labels in metric_logger.log_every(dataloader, print_freq, header): + imgs = imgs.to(device) + labels = labels.to(device) + + preds = model(imgs) + loss: torch.Tensor = criterion(preds, labels, epoch) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if ema is not None: + ema.update(model) + + loss_reduced_values = {k: v.item() for k, v in reduce_dict({'loss': loss}).items()} + metric_logger.update(**loss_reduced_values) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + return stats + + + +@torch.no_grad() +def evaluate(model, criterion, dataloader, device): + model.eval() + + metric_logger = MetricLogger(delimiter=" ") + # metric_logger.add_meter('acc', SmoothedValue(window_size=1, fmt='{global_avg:.4f}')) + # metric_logger.add_meter('loss', SmoothedValue(window_size=1, fmt='{value:.2f}')) + metric_logger.add_meter('acc', SmoothedValue(window_size=1)) + metric_logger.add_meter('loss', SmoothedValue(window_size=1)) + + header = 'Test:' + for imgs, labels in metric_logger.log_every(dataloader, 10, header): + imgs, labels = imgs.to(device), labels.to(device) + preds = model(imgs) + + acc = (preds.argmax(dim=-1) == labels).sum() / preds.shape[0] + loss = criterion(preds, labels) + + dict_reduced = reduce_dict({'acc': acc, 'loss': loss}) + reduced_values = {k: v.item() for k, v in dict_reduced.items()} + metric_logger.update(**reduced_values) + + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + return stats diff --git a/deim/_engine/solver/clas_solver.py b/deim/_engine/solver/clas_solver.py new file mode 100644 index 00000000..e4990719 --- /dev/null +++ b/deim/_engine/solver/clas_solver.py @@ -0,0 +1,71 @@ +""" +Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import time +import json +import datetime +from pathlib import Path + +import torch +import torch.nn as nn + +from ..misc import dist_utils +from ._solver import BaseSolver +from .clas_engine import train_one_epoch, evaluate + + +class ClasSolver(BaseSolver): + + def fit(self, ): + print("Start training") + self.train() + args = self.cfg + + n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad) + print('Number of params:', n_parameters) + + output_dir = Path(args.output_dir) + output_dir.mkdir(exist_ok=True) + + start_time = time.time() + start_epoch = self.last_epoch + 1 + for epoch in range(start_epoch, args.epoches): + + if dist_utils.is_dist_available_and_initialized(): + self.train_dataloader.sampler.set_epoch(epoch) + + train_stats = train_one_epoch(self.model, + self.criterion, + self.train_dataloader, + self.optimizer, + self.ema, + epoch=epoch, + device=self.device) + self.lr_scheduler.step() + self.last_epoch += 1 + + if output_dir: + checkpoint_paths = [output_dir / 'checkpoint.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.checkpoint_freq == 0: + checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + dist_utils.save_on_master(self.state_dict(epoch), checkpoint_path) + + module = self.ema.module if self.ema else self.model + test_stats = evaluate(module, self.criterion, self.val_dataloader, self.device) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if output_dir and dist_utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) diff --git a/deim/_engine/solver/det_engine.py b/deim/_engine/solver/det_engine.py new file mode 100644 index 00000000..5d53dda7 --- /dev/null +++ b/deim/_engine/solver/det_engine.py @@ -0,0 +1,177 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from DETR (https://github.com/facebookresearch/detr/blob/main/engine.py) +Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+""" + + +import sys +import math +from typing import Iterable + +import torch +import torch.amp +from torch.utils.tensorboard import SummaryWriter +from torch.cuda.amp.grad_scaler import GradScaler + +from ..optim import ModelEMA, Warmup +from ..data import CocoEvaluator +from ..misc import MetricLogger, SmoothedValue, dist_utils + + +def train_one_epoch(self_lr_scheduler, lr_scheduler, model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, max_norm: float = 0, **kwargs): + model.train() + criterion.train() + metric_logger = MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + + print_freq = kwargs.get('print_freq', 10) + writer :SummaryWriter = kwargs.get('writer', None) + + ema :ModelEMA = kwargs.get('ema', None) + scaler :GradScaler = kwargs.get('scaler', None) + lr_warmup_scheduler :Warmup = kwargs.get('lr_warmup_scheduler', None) + + cur_iters = epoch * len(data_loader) + + for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + global_step = epoch * len(data_loader) + i + metas = dict(epoch=epoch, step=i, global_step=global_step, epoch_step=len(data_loader)) + + if scaler is not None: + with torch.autocast(device_type=str(device), cache_enabled=True): + outputs = model(samples, targets=targets) + + if torch.isnan(outputs['pred_boxes']).any() or torch.isinf(outputs['pred_boxes']).any(): + print(outputs['pred_boxes']) + state = model.state_dict() + new_state = {} + for key, value in model.state_dict().items(): + # Replace 'module' with 'model' in each key + new_key = key.replace('module.', '') + # Add the updated key-value pair to the state dictionary + state[new_key] = value + new_state['model'] = state + dist_utils.save_on_master(new_state, "./NaN.pth") + + with torch.autocast(device_type=str(device), enabled=False): + loss_dict = criterion(outputs, targets, **metas) + + loss = sum(loss_dict.values()) + scaler.scale(loss).backward() + + if max_norm > 0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + + else: + outputs = model(samples, targets=targets) + loss_dict = criterion(outputs, targets, **metas) + + loss : torch.Tensor = sum(loss_dict.values()) + optimizer.zero_grad() + loss.backward() + + if max_norm > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + + optimizer.step() + + # ema + if ema is not None: + ema.update(model) + + if self_lr_scheduler: + optimizer = lr_scheduler.step(cur_iters + i, optimizer) + else: + if lr_warmup_scheduler is not None: + lr_warmup_scheduler.step() + + loss_dict_reduced = dist_utils.reduce_dict(loss_dict) + loss_value = sum(loss_dict_reduced.values()) + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + print(loss_dict_reduced) + sys.exit(1) + + metric_logger.update(loss=loss_value, **loss_dict_reduced) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + if writer and dist_utils.is_main_process() and global_step % 10 == 0: + writer.add_scalar('Loss/total', loss_value.item(), global_step) + for j, pg in enumerate(optimizer.param_groups): + writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step) + for k, v in 
loss_dict_reduced.items(): + writer.add_scalar(f'Loss/{k}', v.item(), global_step) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessor, data_loader, coco_evaluator: CocoEvaluator, device): + model.eval() + criterion.eval() + coco_evaluator.cleanup() + + metric_logger = MetricLogger(delimiter=" ") + # metric_logger.add_meter('class_error', SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + # iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessor.keys()) + iou_types = coco_evaluator.iou_types + # coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + outputs = model(samples) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + results = postprocessor(outputs, orig_target_sizes) + + # if 'segm' in postprocessor.keys(): + # target_sizes = torch.stack([t["size"] for t in targets], dim=0) + # results = postprocessor['segm'](results, outputs, orig_target_sizes, target_sizes) + + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + + stats = {} + # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in iou_types: + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in iou_types: + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + + return stats, coco_evaluator diff --git a/deim/_engine/solver/det_solver.py b/deim/_engine/solver/det_solver.py new file mode 100644 index 00000000..b9abf222 --- /dev/null +++ b/deim/_engine/solver/det_solver.py @@ -0,0 +1,195 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from D-FINE (https://github.com/Peterande/D-FINE) +Copyright (c) 2024 D-FINE authors. All Rights Reserved. 
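+
+Detection solver: DetSolver.fit() runs training with best-checkpoint tracking
+(best_stg1/best_stg2) and EMA refresh; DetSolver.val() runs evaluation only.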
+""" + +import time +import json +import datetime + +import torch + +from ..misc import dist_utils, stats + +from ._solver import BaseSolver +from .det_engine import train_one_epoch, evaluate +from ..optim.lr_scheduler import FlatCosineLRScheduler + + +class DetSolver(BaseSolver): + + def fit(self, ): + self.train() + args = self.cfg + + n_parameters, model_stats = stats(self.cfg) + print(model_stats) + print("-"*42 + "Start training" + "-"*43) + + self.self_lr_scheduler = False + if args.lrsheduler is not None: + iter_per_epoch = len(self.train_dataloader) + print(" ## Using Self-defined Scheduler-{} ## ".format(args.lrsheduler)) + self.lr_scheduler = FlatCosineLRScheduler(self.optimizer, args.lr_gamma, iter_per_epoch, total_epochs=args.epoches, + warmup_iter=args.warmup_iter, flat_epochs=args.flat_epoch, no_aug_epochs=args.no_aug_epoch) + self.self_lr_scheduler = True + n_parameters = sum([p.numel() for p in self.model.parameters() if p.requires_grad]) + print(f'number of trainable parameters: {n_parameters}') + + top1 = 0 + best_stat = {'epoch': -1, } + # evaluate again before resume training + if self.last_epoch > 0: + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate( + module, + self.criterion, + self.postprocessor, + self.val_dataloader, + self.evaluator, + self.device + ) + for k in test_stats: + best_stat['epoch'] = self.last_epoch + best_stat[k] = test_stats[k][0] + top1 = test_stats[k][0] + print(f'best_stat: {best_stat}') + + best_stat_print = best_stat.copy() + start_time = time.time() + start_epoch = self.last_epoch + 1 + for epoch in range(start_epoch, args.epoches): + + self.train_dataloader.set_epoch(epoch) + # self.train_dataloader.dataset.set_epoch(epoch) + if dist_utils.is_dist_available_and_initialized(): + self.train_dataloader.sampler.set_epoch(epoch) + + if epoch == self.train_dataloader.collate_fn.stop_epoch: + self.load_resume_state(str(self.output_dir / 'best_stg1.pth')) + self.ema.decay = self.train_dataloader.collate_fn.ema_restart_decay + print(f'Refresh EMA at epoch {epoch} with decay {self.ema.decay}') + + train_stats = train_one_epoch( + self.self_lr_scheduler, + self.lr_scheduler, + self.model, + self.criterion, + self.train_dataloader, + self.optimizer, + self.device, + epoch, + max_norm=args.clip_max_norm, + print_freq=args.print_freq, + ema=self.ema, + scaler=self.scaler, + lr_warmup_scheduler=self.lr_warmup_scheduler, + writer=self.writer + ) + + if not self.self_lr_scheduler: # update by epoch + if self.lr_warmup_scheduler is None or self.lr_warmup_scheduler.finished(): + self.lr_scheduler.step() + + self.last_epoch += 1 + + if self.output_dir and epoch < self.train_dataloader.collate_fn.stop_epoch: + checkpoint_paths = [self.output_dir / 'last.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.checkpoint_freq == 0: + checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + dist_utils.save_on_master(self.state_dict(), checkpoint_path) + + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate( + module, + self.criterion, + self.postprocessor, + self.val_dataloader, + self.evaluator, + self.device + ) + + # TODO + for k in test_stats: + if self.writer and dist_utils.is_main_process(): + for i, v in enumerate(test_stats[k]): + self.writer.add_scalar(f'Test/{k}_{i}'.format(k), v, epoch) + + if k in best_stat: + best_stat['epoch'] = epoch if test_stats[k][0] > best_stat[k] else 
best_stat['epoch'] + best_stat[k] = max(best_stat[k], test_stats[k][0]) + else: + best_stat['epoch'] = epoch + best_stat[k] = test_stats[k][0] + + if best_stat[k] > top1: + best_stat_print['epoch'] = epoch + top1 = best_stat[k] + if self.output_dir: + if epoch >= self.train_dataloader.collate_fn.stop_epoch: + dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best_stg2.pth') + else: + dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best_stg1.pth') + + best_stat_print[k] = max(best_stat[k], top1) + print(f'best_stat: {best_stat_print}') # global best + + if best_stat['epoch'] == epoch and self.output_dir: + if epoch >= self.train_dataloader.collate_fn.stop_epoch: + if test_stats[k][0] > top1: + top1 = test_stats[k][0] + dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best_stg2.pth') + else: + top1 = max(test_stats[k][0], top1) + dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best_stg1.pth') + + elif epoch >= self.train_dataloader.collate_fn.stop_epoch: + best_stat = {'epoch': -1, } + self.ema.decay -= 0.0001 + self.load_resume_state(str(self.output_dir / 'best_stg1.pth')) + print(f'Refresh EMA at epoch {epoch} with decay {self.ema.decay}') + + + log_stats = { + **{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters + } + + if self.output_dir and dist_utils.is_main_process(): + with (self.output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (self.output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + self.output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + + def val(self, ): + self.eval() + + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate(module, self.criterion, self.postprocessor, + self.val_dataloader, self.evaluator, self.device) + + if self.output_dir: + dist_utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth") + + return diff --git a/deim/_engine/train.py b/deim/_engine/train.py new file mode 100644 index 00000000..69b9c7fa --- /dev/null +++ b/deim/_engine/train.py @@ -0,0 +1,130 @@ +""" +DEIM: DETR with Improved Matching for Fast Convergence +Copyright (c) 2024 The DEIM Authors. All Rights Reserved. +--------------------------------------------------------------------------------- +Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) +Copyright (c) 2023 lyuwenyu. All Rights Reserved. 
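+
+Illustrative invocation (single process; use torchrun or an equivalent launcher for multi-GPU):
+    python train.py -c <config.yml> --use-amp --seed=0 [-t <pretrained.pth> | -r <checkpoint.pth>]
+-t (tuning) and -r (resume) are mutually exclusive; pass --test-only to evaluate without training.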
+""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) +import argparse +from pathlib import Path + +from engine.core import YAMLConfig, yaml_utils +from engine.misc import dist_utils +from engine.solver import TASKS + +debug = False + +if debug: + import torch + + def custom_repr(self): + return f"{{Tensor:{tuple(self.shape)}}} {original_repr(self)}" + + original_repr = torch.Tensor.__repr__ + torch.Tensor.__repr__ = custom_repr + + +def main( + args, +) -> None: + """main""" + dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed) + + assert not all([args.tuning, args.resume]), ( + "Only support from_scrach or resume or tuning at one time" + ) + + update_dict = yaml_utils.parse_cli(args.update) + update_dict.update({ + k: v + for k, v in args.__dict__.items() + if k + not in [ + "update", + ] + and v is not None + }) + + cfg = YAMLConfig(args.config, **update_dict) + + if args.resume or args.tuning: + if "HGNetv2" in cfg.yaml_cfg: + cfg.yaml_cfg["HGNetv2"]["pretrained"] = False + + print("cfg: ", cfg.__dict__) + + solver = TASKS[cfg.yaml_cfg["task"]](cfg) + + if args.test_only: + solver.val() + else: + solver.fit() + + dist_utils.cleanup() + + +def path_exists(path: str): + """ + Checks if the given path exists. + + Args: + path (str): The path to check. + + Returns: + str: The original path if it exists, otherwise raises a FileNotFoundError. + """ + if not Path(path).exists(): + raise FileNotFoundError(f"Path not found: {path}") + return path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # priority 0 + parser.add_argument( + "-c", + "--config", + type=path_exists, + required=True, + help="Path to the configuration file", + ) + parser.add_argument( + "-r", "--resume", type=path_exists, help="Path to resume from checkpoint" + ) + parser.add_argument( + "-t", "--tuning", type=path_exists, help="Path to tuning checkpoint" + ) + parser.add_argument("-d", "--device", type=str, help="Device to use") + parser.add_argument("--seed", type=int, help="Seed for experiment reproducibility") + parser.add_argument( + "--use-amp", action="store_true", help="Enable auto mixed precision training" + ) + parser.add_argument("--output-dir", type=str, help="Path to the output directory") + parser.add_argument( + "--summary-dir", type=str, help="Path to the TensorBoard summary directory" + ) + parser.add_argument( + "--test-only", action="store_true", default=False, help="Only run testing" + ) + + # priority 1 + parser.add_argument("-u", "--update", nargs="+", help="Update YAML config") + + # env + parser.add_argument( + "--print-method", type=str, default="builtin", help="Print method" + ) + parser.add_argument( + "--print-rank", type=int, default=0, help="Rank ID for printing" + ) + + parser.add_argument("--local-rank", type=int, help="Local rank ID") + args = parser.parse_args() + + main(args) diff --git a/deim/_engine/weight/hgnetv2/PPHGNetV2_B0_stage1.pth b/deim/_engine/weight/hgnetv2/PPHGNetV2_B0_stage1.pth new file mode 100644 index 00000000..58661a87 Binary files /dev/null and b/deim/_engine/weight/hgnetv2/PPHGNetV2_B0_stage1.pth differ diff --git a/deim/_utils/__init__.py b/deim/_utils/__init__.py new file mode 100644 index 00000000..3a320ebc --- /dev/null +++ b/deim/_utils/__init__.py @@ -0,0 +1,7 @@ +"""Utility modules for DEIM""" + +from .visualizer import Visualizer +from .logger import Logger +from .metrics import calculate_metrics + +__all__ = ['Visualizer', 'Logger', 
'calculate_metrics'] \ No newline at end of file diff --git a/deim/_utils/logger.py b/deim/_utils/logger.py new file mode 100644 index 00000000..263e5133 --- /dev/null +++ b/deim/_utils/logger.py @@ -0,0 +1,74 @@ +""" +Logging utilities for DEIM training and inference +""" + +import logging +import sys +from pathlib import Path +from typing import Optional + + +class Logger: + """ + Logger for DEIM operations + + Provides consistent logging across training and inference + """ + + def __init__(self, + name: str = 'DEIM', + log_file: Optional[str] = None, + level: int = logging.INFO): + """ + Initialize logger + + Args: + name: Logger name + log_file: Optional file to save logs + level: Logging level + """ + + self.logger = logging.getLogger(name) + self.logger.setLevel(level) + + # Clear any existing handlers + self.logger.handlers = [] + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(level) + + # Format + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + console_handler.setFormatter(formatter) + self.logger.addHandler(console_handler) + + # File handler if specified + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(level) + file_handler.setFormatter(formatter) + self.logger.addHandler(file_handler) + + def info(self, message: str): + """Log info message""" + self.logger.info(message) + + def warning(self, message: str): + """Log warning message""" + self.logger.warning(message) + + def error(self, message: str): + """Log error message""" + self.logger.error(message) + + def debug(self, message: str): + """Log debug message""" + self.logger.debug(message) + + def critical(self, message: str): + """Log critical message""" + self.logger.critical(message) \ No newline at end of file diff --git a/deim/_utils/metrics.py b/deim/_utils/metrics.py new file mode 100644 index 00000000..d0c78751 --- /dev/null +++ b/deim/_utils/metrics.py @@ -0,0 +1,124 @@ +""" +Metrics calculation for DEIM +Evaluation utilities for object detection +""" + +import torch +import numpy as np +from typing import Dict, List, Tuple, Any + + +def calculate_metrics(predictions: List[Dict], + targets: List[Dict], + iou_threshold: float = 0.5) -> Dict[str, float]: + """ + Calculate object detection metrics + + Args: + predictions: List of prediction dictionaries + targets: List of target dictionaries + iou_threshold: IoU threshold for matching + + Returns: + Dictionary with metrics (mAP, precision, recall, etc.) 
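+
+    Example (illustrative; exercises the greedy IoU matching implemented below,
+    not a full COCO mAP evaluation):
+        >>> import torch
+        >>> preds = [{'boxes': torch.tensor([[0., 0., 10., 10.]]),
+        ...           'scores': torch.tensor([0.9]),
+        ...           'labels': torch.tensor([1])}]
+        >>> targets = [{'boxes': torch.tensor([[0., 0., 10., 10.]]),
+        ...             'labels': torch.tensor([1])}]
+        >>> calculate_metrics(preds, targets)['precision']
+        1.0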
+ """ + + # Simple placeholder implementation + # In production, would use COCO evaluation metrics + + total_tp = 0 # True positives + total_fp = 0 # False positives + total_fn = 0 # False negatives + + for pred, target in zip(predictions, targets): + pred_boxes = pred.get('boxes', torch.empty(0, 4)) + pred_scores = pred.get('scores', torch.empty(0)) + pred_labels = pred.get('labels', torch.empty(0)) + + target_boxes = target.get('boxes', torch.empty(0, 4)) + target_labels = target.get('labels', torch.empty(0)) + + # Calculate IoU between predictions and targets + if len(pred_boxes) > 0 and len(target_boxes) > 0: + ious = calculate_iou(pred_boxes, target_boxes) + + # Match predictions to targets + matched_targets = set() + + for i in range(len(pred_boxes)): + max_iou = ious[i].max() if len(ious[i]) > 0 else 0 + max_idx = ious[i].argmax() if len(ious[i]) > 0 else -1 + + if max_iou > iou_threshold and max_idx not in matched_targets: + # Check if labels match + if pred_labels[i] == target_labels[max_idx]: + total_tp += 1 + matched_targets.add(max_idx.item() if torch.is_tensor(max_idx) else max_idx) + else: + total_fp += 1 + else: + total_fp += 1 + + # Count false negatives (unmatched targets) + total_fn += len(target_boxes) - len(matched_targets) + + elif len(pred_boxes) > 0: + # All predictions are false positives + total_fp += len(pred_boxes) + elif len(target_boxes) > 0: + # All targets are false negatives + total_fn += len(target_boxes) + + # Calculate metrics + precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0 + recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0 + f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1, + 'true_positives': total_tp, + 'false_positives': total_fp, + 'false_negatives': total_fn + } + + +def calculate_iou(boxes1: torch.Tensor, + boxes2: torch.Tensor) -> torch.Tensor: + """ + Calculate IoU between two sets of boxes + + Args: + boxes1: Tensor of shape (N, 4) + boxes2: Tensor of shape (M, 4) + + Returns: + IoU matrix of shape (N, M) + """ + + # Ensure tensors + if not torch.is_tensor(boxes1): + boxes1 = torch.tensor(boxes1) + if not torch.is_tensor(boxes2): + boxes2 = torch.tensor(boxes2) + + # Calculate intersection + x1 = torch.max(boxes1[:, None, 0], boxes2[:, 0]) + y1 = torch.max(boxes1[:, None, 1], boxes2[:, 1]) + x2 = torch.min(boxes1[:, None, 2], boxes2[:, 2]) + y2 = torch.min(boxes1[:, None, 3], boxes2[:, 3]) + + intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0) + + # Calculate areas + area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1]) + area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1]) + + # Calculate union + union = area1[:, None] + area2 - intersection + + # Calculate IoU + iou = intersection / (union + 1e-6) + + return iou \ No newline at end of file diff --git a/deim/_utils/visualizer.py b/deim/_utils/visualizer.py new file mode 100644 index 00000000..2c3ae401 --- /dev/null +++ b/deim/_utils/visualizer.py @@ -0,0 +1,119 @@ +""" +Visualization utilities for DEIM +Uses supervision package for annotating detection results +""" + +import numpy as np +from typing import Dict, Any, List, Optional +from pathlib import Path + + +class Visualizer: + """ + Visualization handler for DEIM detections + + Uses supervision package for high-quality annotations + """ + + def __init__(self, class_names: Optional[Dict[int, str]] = None): + """ + 
Initialize visualizer + + Args: + class_names: Dictionary mapping class IDs to names + """ + self.class_names = class_names or {} + + # Try to import supervision + try: + import supervision as sv + self.sv = sv + self.available = True + + # Initialize annotators + self.box_annotator = sv.BoxAnnotator() + self.label_annotator = sv.LabelAnnotator(smart_position=True) + + # Optional: mask annotator for segmentation + try: + self.mask_annotator = sv.MaskAnnotator() + except: + self.mask_annotator = None + + except ImportError: + self.available = False + print("โš ๏ธ Supervision not installed. Install with: pip install supervision") + + def visualize(self, + image: np.ndarray, + detections: Dict[str, Any], + conf_threshold: float = 0.4) -> np.ndarray: + """ + Visualize detections on image + + Args: + image: Input image as numpy array + detections: Detection results with 'boxes', 'scores', 'labels' + conf_threshold: Confidence threshold for display + + Returns: + Annotated image + """ + + if not self.available: + print("Supervision not available. Returning original image.") + return image + + try: + # Filter by confidence + if 'scores' in detections: + mask = detections['scores'] > conf_threshold + boxes = detections['boxes'][mask] + scores = detections['scores'][mask] + labels = detections['labels'][mask] + else: + boxes = detections['boxes'] + scores = np.ones(len(boxes)) + labels = detections.get('labels', np.zeros(len(boxes))) + + # Create supervision Detections object + sv_detections = self.sv.Detections( + xyxy=boxes, + confidence=scores, + class_id=labels.astype(int) + ) + + # Create labels + labels_list = [] + for class_id, score in zip(labels, scores): + class_name = self.class_names.get(int(class_id), f"Class {class_id}") + label = f"{class_name} {score:.2f}" + labels_list.append(label) + + # Annotate image + annotated = image.copy() + annotated = self.box_annotator.annotate( + scene=annotated, + detections=sv_detections + ) + annotated = self.label_annotator.annotate( + scene=annotated, + detections=sv_detections, + labels=labels_list + ) + + return annotated + + except Exception as e: + print(f"Visualization error: {str(e)}") + return image + + def save_visualization(self, + image: np.ndarray, + save_path: str): + """Save visualization to file""" + from PIL import Image + + img = Image.fromarray(image) + img.save(save_path) + print(f" Saved visualization: {save_path}") \ No newline at end of file diff --git a/deim/api.py b/deim/api.py new file mode 100644 index 00000000..1e5722fe --- /dev/null +++ b/deim/api.py @@ -0,0 +1,311 @@ +""" +DEIM API - Simple interface for training and inference +Similar to ultralytics YOLO but for DEIM models +""" + +import os +import sys +from pathlib import Path +from typing import Union, List, Optional, Dict, Any +import datetime +import torch + +# Add current directory to path for imports +current_dir = Path(__file__).parent +sys.path.insert(0, str(current_dir)) + +from _core.config import ConfigManager +from _core.trainer import Trainer +from _core.predictor import Predictor + + +class DEIM: + """ + DEIM Model API - Simple interface for training and inference + + Args: + config: Configuration name ('under', 'sides') or path to custom YAML + device: CUDA device (default: 'cuda:0', always GPU) + + Examples: + >>> # Training from scratch + >>> model = DEIM(config='under') + >>> model.train(epochs=100, batch_size=32) + + >>> # Training with pretrained weights + >>> model = DEIM(config='sides') + >>> model.train(pretrained='base_model.pth', epochs=50) + + 
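>>> # Custom config YAML (illustrative path; see docs/CONFIGURATION_REFERENCE.md)
+        >>> model = DEIM(config='deim/_configs/my_dataset.yml')
+
+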
>>> # Inference + >>> model = DEIM(config='under') + >>> model.load('deim_outputs/under/20241002_143022/best_stg1.pth') + >>> results = model.predict('image.jpg', visualize=True) + """ + + def __init__(self, config: str = 'under', device: str = 'cuda:0'): + """Initialize DEIM model with config""" + + # Always use GPU + if not torch.cuda.is_available(): + raise RuntimeError("DEIM requires GPU. No GPU detected!") + + self.device = torch.device(device) + self.config_name = config + + # Initialize config manager + self.config_manager = ConfigManager(config) + self.cfg = self.config_manager.get_config() + + # Model and predictor (lazy initialization) + self.model = None + self.predictor = None + self.trainer = None + + print(f"โœ“ DEIM initialized with config: {config}") + print(f" Device: {self.device}") + + def train(self, + pretrained: Optional[str] = None, + epochs: Optional[int] = None, + batch_size: Optional[int] = None, + learning_rate: Optional[float] = None, + dataset_path: Optional[str] = None, + output_dir: Optional[str] = None, + **kwargs) -> Dict[str, Any]: + """ + Train the DEIM model + + Args: + pretrained: Path to pretrained weights or None for training from scratch + epochs: Number of training epochs (overrides config) + batch_size: Batch size (overrides config) + learning_rate: Learning rate (overrides config) + dataset_path: Custom dataset path (overrides config) + output_dir: Custom output directory (overrides config) + **kwargs: Additional training parameters + + Returns: + Dictionary with training results and output paths + + Examples: + >>> model = DEIM(config='under') + >>> # Train from scratch + >>> model.train(epochs=100, batch_size=32) + + >>> # Train with pretrained weights + >>> model.train(pretrained='base_model.pth', epochs=50) + + >>> # Custom dataset + >>> model.train(dataset_path='/path/to/dataset', epochs=100) + """ + + print("\n" + "="*60) + print("DEIM TRAINING") + print("="*60) + + # Update config with overrides + overrides = {} + if epochs is not None: + overrides['epochs'] = epochs + if batch_size is not None: + overrides['batch_size'] = batch_size + if learning_rate is not None: + overrides['learning_rate'] = learning_rate + if dataset_path is not None: + overrides['dataset_path'] = dataset_path + if output_dir is not None: + overrides['output_dir'] = output_dir + + # Add any additional kwargs + overrides.update(kwargs) + + # Apply overrides + if overrides: + self.cfg = self.config_manager.apply_overrides(overrides) + + # Initialize trainer + self.trainer = Trainer( + config=self.cfg, + device=self.device, + pretrained=pretrained + ) + + # Start training + print(f"\n๐Ÿ“Š Training Configuration:") + print(f" Config: {self.config_name}") + print(f" Pretrained: {pretrained if pretrained else 'None (from scratch)'}") + print(f" Epochs: {self.cfg.get('epochs', 100)}") + print(f" Batch Size: {self.cfg.get('batch_size', 32)}") + print(f" Learning Rate: {self.cfg.get('learning_rate', 0.001)}") + + # Determine output base directory + # NOTE: Do NOT add timestamp here - the training engine (yaml_config.py) + # automatically appends a timestamp to output_dir. 
If we add one here too, + # we get nested timestamps like: deim_outputs/under/20251002_215916/20251002_215921/ + if 'output_dir' in self.cfg: + # Use output_dir from config (e.g., 'deim_outputs/under') + output_dir = self.cfg['output_dir'] + else: + # Use default structure based on config name + if self.config_name in ['under', 'sides']: + output_dir = f"deim_outputs/{self.config_name}" + else: + output_dir = "deim_outputs/custom" + + # The training engine will create: {output_dir}/{timestamp}/ + # e.g., deim_outputs/under/20251003_123045/ + print(f" Output Base: {output_dir}") + + print("\nโšก Starting training...") + print(" Note: This will take time. Monitor logs for progress.") + + # Run training + results = self.trainer.train(output_dir=output_dir) + + print(f"\nโœ… Training complete!") + print(f" Models saved to: {output_dir}//") + print(f" Note: Training engine creates timestamped subdirectory automatically") + + return results + + def load(self, checkpoint_path: str): + """ + Load a trained model from checkpoint + + Args: + checkpoint_path: Path to .pth checkpoint file + + Examples: + >>> model = DEIM(config='under') + >>> model.load('deim_outputs/under/20241002_143022/best_stg1.pth') + """ + + if not Path(checkpoint_path).exists(): + raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}") + + print(f"๐Ÿ“ฅ Loading model from: {checkpoint_path}") + + # Initialize predictor with loaded weights + self.predictor = Predictor( + config=self.cfg, + checkpoint_path=checkpoint_path, + device=self.device + ) + + print(f"โœ“ Model loaded successfully") + + def predict(self, + source: Union[str, List[str]], + conf_threshold: float = 0.4, + visualize: bool = False, + save_path: Optional[str] = None, + save_dir: Optional[str] = None) -> List[Dict]: + """ + Run inference on images or videos + + Args: + source: Image path, video path, list of paths, or directory + conf_threshold: Confidence threshold for detections + visualize: Whether to visualize detections + save_path: Path to save single output (for single image/video) + save_dir: Directory to save batch outputs + + Returns: + List of detection result dictionaries (always a list, even for single images) + + Examples: + >>> # Single image + >>> results = model.predict('image.jpg', visualize=True) + + >>> # Multiple images + >>> results = model.predict(['img1.jpg', 'img2.jpg']) + + >>> # Directory + >>> results = model.predict('path/to/images/', save_dir='outputs/') + + >>> # Video + >>> results = model.predict('video.mp4', visualize=True, save_path='output.mp4') + """ + + if self.predictor is None: + raise RuntimeError("No model loaded! 
Call .load() first or train a model") + + print(f"\n๐Ÿ” Running inference...") + + # Determine source type + if isinstance(source, str): + source_path = Path(source) + + if source_path.is_dir(): + # Directory of images + print(f" Source: Directory ({source})") + image_extensions = ['.jpg', '.jpeg', '.png', '.bmp'] + sources = [ + str(f) for f in source_path.glob('*') + if f.suffix.lower() in image_extensions + ] + print(f" Found {len(sources)} images") + + elif source_path.suffix.lower() in ['.mp4', '.avi', '.mov']: + # Video file + print(f" Source: Video ({source})") + sources = source + + else: + # Single image + print(f" Source: Image ({source})") + sources = source + + elif isinstance(source, list): + # List of paths + print(f" Source: List of {len(source)} items") + sources = source + else: + raise ValueError(f"Invalid source type: {type(source)}") + + # Run prediction + results = self.predictor.predict( + sources=sources, + conf_threshold=conf_threshold, + visualize=visualize, + save_path=save_path, + save_dir=save_dir + ) + + print(f"โœ“ Inference complete") + + # Always return a list for consistency + if not isinstance(results, list): + results = [results] + + return results + + def export(self, export_path: str, format: str = 'pytorch'): + """ + Export model to different formats + + Args: + export_path: Path to save exported model + format: Export format ('pytorch', 'onnx', 'torchscript') + + Examples: + >>> model.export('model.pth', format='pytorch') + >>> model.export('model.onnx', format='onnx') + """ + + if self.predictor is None: + raise RuntimeError("No model loaded! Call .load() first") + + print(f"๐Ÿ“ฆ Exporting model to {format} format...") + + if format == 'pytorch': + # Save PyTorch model + torch.save(self.predictor.model.state_dict(), export_path) + print(f"โœ“ Model exported to: {export_path}") + + elif format == 'onnx': + # Export to ONNX (TODO: implement ONNX export) + print("โš ๏ธ ONNX export not yet implemented") + + else: + raise ValueError(f"Unsupported export format: {format}") \ No newline at end of file diff --git a/deim_outputs/best_models/sides/best_stg1.pth b/deim_outputs/best_models/sides/best_stg1.pth new file mode 100644 index 00000000..1dcf63ec Binary files /dev/null and b/deim_outputs/best_models/sides/best_stg1.pth differ diff --git a/deim_outputs/best_models/sides/best_stg2.pth b/deim_outputs/best_models/sides/best_stg2.pth new file mode 100644 index 00000000..488ed3ad Binary files /dev/null and b/deim_outputs/best_models/sides/best_stg2.pth differ diff --git a/deim_outputs/best_models/under/best_stg1.pth b/deim_outputs/best_models/under/best_stg1.pth new file mode 100644 index 00000000..6c3f2d73 Binary files /dev/null and b/deim_outputs/best_models/under/best_stg1.pth differ diff --git a/deim_outputs/best_models/under/best_stg2.pth b/deim_outputs/best_models/under/best_stg2.pth new file mode 100644 index 00000000..ed25b774 Binary files /dev/null and b/deim_outputs/best_models/under/best_stg2.pth differ diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000..1176b0b6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,89 @@ +# CVAT Docker Setup + +Simple CVAT (Computer Vision Annotation Tool) setup for thermal image annotation. + +## Quick Start + +```bash +# Start CVAT +cd docker +docker compose up -d + +# View logs +docker compose logs -f + +# Stop CVAT +docker compose down +``` + +## Access + +- **CVAT UI**: http://localhost:1280 +- **CVAT API**: http://localhost:12808 + +## First Time Setup + +1. 
Start the containers: + ```bash + docker compose up -d + ``` + +2. Wait for initialization (check logs): + ```bash + docker compose logs -f cvat + ``` + +3. Create superuser account: + ```bash + docker compose exec cvat python3 ~/manage.py createsuperuser + ``` + +4. Access CVAT at http://localhost:1280 and login + +## Data Persistence + +All data is stored in `./volumes/` (gitignored): +- `cvat_db/` - PostgreSQL database +- `cvat_data/` - Annotation data and uploaded images +- `cvat_keys/` - Authentication keys +- `cvat_logs/` - Application logs +- `cvat_models/` - ML models (if used) + +## Container Architecture + +- **cvat_db**: PostgreSQL 15 database +- **cvat_redis**: Redis cache +- **cvat**: Main CVAT server +- **cvat_ui**: Web UI (Nginx) +- **cvat_worker_import**: Import task worker +- **cvat_worker_export**: Export task worker +- **cvat_worker_annotation**: Annotation processing worker +- **cvat_worker_webhooks**: Webhook worker +- **opa**: Open Policy Agent for authorization + +## Useful Commands + +```bash +# View all containers +docker compose ps + +# Restart CVAT +docker compose restart cvat + +# View database logs +docker compose logs cvat_db + +# Backup database +docker compose exec cvat_db pg_dump -U root cvat > backup.sql + +# Clean everything (WARNING: deletes all data) +docker compose down -v +rm -rf volumes/* +``` + +## Notes + +- Uses CVAT v2.11.0 (stable release) +- No auto-annotation/inference server configured yet +- All volumes are gitignored for security +- Redis and PostgreSQL run in network-isolated mode diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 00000000..129fe54c --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,178 @@ +version: '3.8' + +services: + cvat_db: + container_name: cvat_db + image: postgres:15-alpine + restart: always + environment: + POSTGRES_USER: root + POSTGRES_DB: cvat + POSTGRES_HOST_AUTH_METHOD: trust + volumes: + - ./volumes/cvat_db:/var/lib/postgresql/data + networks: + - cvat + + cvat_redis: + container_name: cvat_redis + image: redis:7.0-alpine + restart: always + networks: + - cvat + + cvat: + container_name: cvat + image: cvat/server:v2.11.0 + restart: always + depends_on: + - cvat_db + - cvat_redis + environment: + DJANGO_MODWSGI_EXTRA_ARGS: '' + ALLOWED_HOSTS: '*' + CVAT_REDIS_HOST: cvat_redis + CVAT_POSTGRES_HOST: cvat_db + CVAT_POSTGRES_USER: root + CVAT_POSTGRES_DBNAME: cvat + CVAT_POSTGRES_PASSWORD: '' + ADAPTIVE_AUTO_ANNOTATION: 'false' + IAM_OPA_BUNDLE: '1' + no_proxy: elasticsearch,kibana,logstash,nuclio,opa,${no_proxy:-} + NUMPROCS: 1 + command: -c supervisord/server.conf + volumes: + - ./volumes/cvat_data:/home/django/data + - ./volumes/cvat_keys:/home/django/keys + - ./volumes/cvat_logs:/home/django/logs + - ./volumes/cvat_models:/home/django/models + ports: + - "12808:8080" + networks: + - cvat + + cvat_ui: + container_name: cvat_ui + image: cvat/ui:v2.11.0 + restart: always + depends_on: + - cvat + networks: + - cvat + ports: + - "1280:80" + + cvat_worker_import: + container_name: cvat_worker_import + image: cvat/server:v2.11.0 + restart: always + depends_on: + - cvat_db + - cvat_redis + environment: + CVAT_REDIS_HOST: cvat_redis + CVAT_POSTGRES_HOST: cvat_db + CVAT_POSTGRES_USER: root + CVAT_POSTGRES_DBNAME: cvat + CVAT_POSTGRES_PASSWORD: '' + no_proxy: elasticsearch,kibana,logstash,nuclio,opa,${no_proxy:-} + NUMPROCS: 2 + command: -c supervisord/worker.import.conf + volumes: + - ./volumes/cvat_data:/home/django/data + - ./volumes/cvat_keys:/home/django/keys + - 
./volumes/cvat_logs:/home/django/logs + - ./volumes/cvat_models:/home/django/models + networks: + - cvat + + cvat_worker_export: + container_name: cvat_worker_export + image: cvat/server:v2.11.0 + restart: always + depends_on: + - cvat_db + - cvat_redis + environment: + CVAT_REDIS_HOST: cvat_redis + CVAT_POSTGRES_HOST: cvat_db + CVAT_POSTGRES_USER: root + CVAT_POSTGRES_DBNAME: cvat + CVAT_POSTGRES_PASSWORD: '' + no_proxy: elasticsearch,kibana,logstash,nuclio,opa,${no_proxy:-} + NUMPROCS: 2 + command: -c supervisord/worker.export.conf + volumes: + - ./volumes/cvat_data:/home/django/data + - ./volumes/cvat_keys:/home/django/keys + - ./volumes/cvat_logs:/home/django/logs + - ./volumes/cvat_models:/home/django/models + networks: + - cvat + + cvat_worker_annotation: + container_name: cvat_worker_annotation + image: cvat/server:v2.11.0 + restart: always + depends_on: + - cvat_db + - cvat_redis + environment: + CVAT_REDIS_HOST: cvat_redis + CVAT_POSTGRES_HOST: cvat_db + CVAT_POSTGRES_USER: root + CVAT_POSTGRES_DBNAME: cvat + CVAT_POSTGRES_PASSWORD: '' + no_proxy: elasticsearch,kibana,logstash,nuclio,opa,${no_proxy:-} + NUMPROCS: 1 + command: -c supervisord/worker.annotation.conf + volumes: + - ./volumes/cvat_data:/home/django/data + - ./volumes/cvat_keys:/home/django/keys + - ./volumes/cvat_logs:/home/django/logs + - ./volumes/cvat_models:/home/django/models + networks: + - cvat + + cvat_worker_webhooks: + container_name: cvat_worker_webhooks + image: cvat/server:v2.11.0 + restart: always + depends_on: + - cvat_db + - cvat_redis + environment: + CVAT_REDIS_HOST: cvat_redis + CVAT_POSTGRES_HOST: cvat_db + CVAT_POSTGRES_USER: root + CVAT_POSTGRES_DBNAME: cvat + CVAT_POSTGRES_PASSWORD: '' + no_proxy: elasticsearch,kibana,logstash,nuclio,opa,${no_proxy:-} + NUMPROCS: 1 + command: -c supervisord/worker.webhooks.conf + volumes: + - ./volumes/cvat_data:/home/django/data + - ./volumes/cvat_keys:/home/django/keys + - ./volumes/cvat_logs:/home/django/logs + - ./volumes/cvat_models:/home/django/models + networks: + - cvat + + opa: + container_name: cvat_opa + image: openpolicyagent/opa:0.45.0-rootless + restart: always + networks: + - cvat + command: + - run + - --server + - --set=services.cvat.url=http://cvat:8080 + - --set=bundles.cvat.service=cvat + - --set=bundles.cvat.resource=/api/auth/rules + - --set=bundles.cvat.polling.min_delay_seconds=5 + - --set=bundles.cvat.polling.max_delay_seconds=15 + +networks: + cvat: + driver: bridge diff --git a/docker/volumes/.gitignore b/docker/volumes/.gitignore new file mode 100644 index 00000000..dadf22ba --- /dev/null +++ b/docker/volumes/.gitignore @@ -0,0 +1,3 @@ +# Ignore all volume data (persistent storage) +* +!.gitignore diff --git a/docs/CONFIGURATION_REFERENCE.md b/docs/CONFIGURATION_REFERENCE.md new file mode 100644 index 00000000..2f5c6f6d --- /dev/null +++ b/docs/CONFIGURATION_REFERENCE.md @@ -0,0 +1,777 @@ +# DEIM Configuration Reference + +**Complete technical reference** for all DEIM configuration parameters and training options. + +> ๐Ÿ’ก **New to DEIM?** Start with [QUICKSTART.md](QUICKSTART.md) for a simple 3-step guide to get started quickly. + +This document provides in-depth explanations of all configuration files, parameters, and customization options for training DEIM on custom datasets. + +--- + +## Step 1: Prepare Your Dataset + +DEIM uses **COCO format** for object detection. 
Your dataset folder should look like: + +``` +my_dataset/ +โ”œโ”€โ”€ train/ +โ”‚ โ””โ”€โ”€ images/ +โ”‚ โ”œโ”€โ”€ img_001.jpg +โ”‚ โ”œโ”€โ”€ img_002.jpg +โ”‚ โ””โ”€โ”€ ... +โ”œโ”€โ”€ val/ +โ”‚ โ””โ”€โ”€ images/ +โ”‚ โ”œโ”€โ”€ img_100.jpg +โ”‚ โ””โ”€โ”€ ... +โ””โ”€โ”€ annotations/ + โ”œโ”€โ”€ instances_train.json + โ””โ”€โ”€ instances_val.json +``` + +### COCO Annotation Format + +The `instances_train.json` and `instances_val.json` files should follow this structure: + +```json +{ + "images": [ + { + "id": 1, + "file_name": "img_001.jpg", + "width": 640, + "height": 480 + } + ], + "annotations": [ + { + "id": 1, + "image_id": 1, + "category_id": 0, + "bbox": [x, y, width, height], + "area": 12800, + "iscrowd": 0 + } + ], + "categories": [ + { + "id": 0, + "name": "my_class", + "supercategory": "object" + } + ] +} +``` + +**Note**: COCO bbox format is `[x, y, width, height]` where `(x, y)` is the **top-left corner**. + +### Converting from YOLO Format + +If you have YOLO format annotations, you can convert them using tools like: +- [YOLO to COCO converter](https://github.com/Taeyoung96/Yolo-to-COCO-format-converter) +- Or write a simple conversion script + +--- + +## Step 2: Create Configuration Files + +DEIM uses a **modular config system** with base configs and imports. You'll need to create **3 files**: + +1. **`_base/dataset_*.yml`** - Dataset paths and class definitions +2. **`_base/dataloader_*.yml`** - Data augmentation and preprocessing (the most important for tuning!) +3. **`*.yml`** - Main config that imports and combines everything + +This separation allows you to: +- โœ… Reuse augmentation strategies across different models +- โœ… Keep related settings together +- โœ… Override specific values without duplicating everything + +### 2.1 Create Base Dataset Config + +**File**: `deim/_configs/_base/dataset_my_dataset.yml` + +```yaml +# Dataset configuration for 'my_dataset' detection + +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +# Number of classes in your dataset +num_classes: 3 # e.g., 3 classes: cat, dog, bird + +# Class name mappings for visualization +class_names: + 0: cat + 1: dog + 2: bird + +# Set to True if using COCO-pretrained models +remap_mscoco_category: False + +# Training data +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /path/to/my_dataset/train/images + ann_file: /path/to/my_dataset/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ # Will be defined in dataloader config + shuffle: True + num_workers: 4 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +# Validation data +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /path/to/my_dataset/val/images + ann_file: /path/to/my_dataset/annotations/instances_val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 4 + drop_last: False + collate_fn: + type: BatchImageCollateFunction +``` + +### 2.2 Create Dataloader Config + +**File**: `deim/_configs/_base/dataloader_my_dataset.yml` + +```yaml +# Data augmentation and preprocessing for training + +train_dataloader: + total_batch_size: 32 # Adjust based on your GPU memory + num_workers: 4 + dataset: + transforms: + type: Compose + ops: + # Data augmentation (applied during training) + - type: RandomPhotometricDistort + p: 0.5 + - type: RandomZoomOut + fill: 0 + - type: RandomIoUCrop + p: 0.8 + - type: SanitizeBoundingBoxes + min_size: 1 + - type: 
RandomHorizontalFlip + + # Optional: Advanced augmentations + - type: GaussianBlur + kernel_size: [3, 5] + sigma: [0.1, 2.0] + p: 0.3 + - type: RandomRotation + degrees: 10 + p: 0.5 + - type: RandomPerspective + distortion_scale: 0.2 + p: 0.3 + - type: RandomAdjustSharpness + sharpness_factor: 2 + p: 0.3 + + # Final resize and normalization + - type: Resize + size: [640, 640] + - type: SanitizeBoundingBoxes + min_size: 1 + - type: ConvertPILImage + dtype: float32 + scale: true + - type: ConvertBoxes + fmt: cxcywh + normalize: true + + # Stop heavy augmentations after certain epoch + policy: + name: stop_epoch + epoch: 200 # Stop augmentations at epoch 200 + ops: + - Mosaic + - RandomPhotometricDistort + - RandomZoomOut + - RandomIoUCrop + - GaussianBlur + - RandomRotation + - RandomPerspective + - RandomAdjustSharpness + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + stop_epoch: 200 + +# Validation dataloader (minimal preprocessing) +val_dataloader: + total_batch_size: 128 # Can be larger since no gradients + num_workers: 4 +``` + +### 2.3 Create Main Config + +**File**: `deim/_configs/my_dataset.yml` + +```yaml +# DEIM Configuration for 'my_dataset' detection + +__include__: + [ + "./_base/dataset_my_dataset.yml", + "./_base/runtime.yml", + "./_base/dfine_hgnetv2.yml", + "./_base/dataloader_my_dataset.yml", + "./_base/optimizer.yml", + ] + +output_dir: ./deim_outputs/my_dataset + +# Model architecture +DEIM: + backbone: HGNetv2 # Options: HGNetv2, ResNet50, CSPDarkNet + +# Backbone configuration +HGNetv2: + name: "B0" # Options: B0 (fastest), B1, B2, B3, B4, B5, B6 (most accurate) + return_idx: [2, 3] + freeze_at: -1 # -1 = no freezing, 0 = freeze all, N = freeze first N stages + freeze_norm: False + use_lab: True # Use LAB color space normalization + +# Encoder configuration +HybridEncoder: + in_channels: [512, 1024] # Must match backbone output channels + feat_strides: [16, 32] + hidden_dim: 128 # Lower = faster, higher = more accurate + use_encoder_idx: [1] + dim_feedforward: 512 + expansion: 0.34 + depth_mult: 0.5 + +# Decoder configuration +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + num_layers: 3 # More layers = better accuracy, slower training + eval_idx: -1 + num_points: [6, 6] + +# Optimizer settings +optimizer: + lr: 0.0008 # Learning rate - adjust based on batch size + # Rule of thumb: lr = 0.0001 * (batch_size / 16) +``` + +--- + +## Step 3: Configuration Parameters Explained + +### Dataset Parameters + +| Parameter | Description | Example | Technical Details | +|-----------|-------------|---------|-------------------| +| `num_classes` | Number of object classes in your dataset | `3` for cat, dog, bird | Defines output dimension for classification head | +| `class_names` | Mapping of class IDs to readable names | `{0: cat, 1: dog, 2: bird}` | Used for visualization in predict(); must match category_id in COCO JSON | +| `img_folder` | Path to image directory | `/path/to/images/` | Absolute or relative path; must contain all images referenced in ann_file | +| `ann_file` | Path to COCO JSON annotation file | `/path/to/instances.json` | COCO format: images, annotations, categories arrays | +| `remap_mscoco_category` | Use COCO category mapping (only for COCO-pretrained) | `False` for custom datasets | `True` if using COCO's 80 classes, `False` for custom classes | +| `return_masks` | Return instance segmentation masks | `False` | Set to `True` only for instance segmentation 
tasks | +| `num_workers` | Data loading threads | `4-8` | Higher values speed up data loading; limit based on CPU cores | +| `shuffle` | Shuffle data during training | `True` | Always `True` for training, `False` for validation | +| `drop_last` | Drop incomplete batches | `True` | Prevents size mismatch issues during training | + +### Model Architecture Parameters + +#### Backbone Configuration (HGNetv2) + +| Parameter | Description | Values | Impact | GPU Memory | +|-----------|-------------|---------|--------|------------| +| `name` | Backbone size variant | `B0`, `B1`, `B2`, `B3`, `B4`, `B5`, `B6` | B0 fastest (~60 FPS), B6 most accurate | B0: ~6GB, B4: ~18GB (batch 32) | +| `return_idx` | Feature pyramid levels to return | `[2, 3]` (default) | Controls multi-scale detection; typically last 2 stages | +| `freeze_at` | Freeze backbone stages | `-1` (no freeze), `0` (all), `N` (first N) | Freezing speeds training but reduces adaptability | +| `freeze_norm` | Freeze batch normalization layers | `False` | Set `True` for fine-tuning on small datasets | +| `use_lab` | Use LAB color space normalization | `True` | LAB improves robustness to lighting variations | +| `in_channels` | Backbone output channels | `[512, 1024]` for B0 | Must match backbone architecture | + +#### Encoder Configuration (HybridEncoder) + +| Parameter | Description | Default | Impact | +|-----------|-------------|---------|--------| +| `hidden_dim` | Feature embedding dimension | `128` | Higher = more capacity, slower inference | +| `feat_strides` | Downsampling strides | `[16, 32]` | Matches backbone output strides | +| `use_encoder_idx` | Which encoder stages to use | `[1]` | Index of feature pyramid levels | +| `dim_feedforward` | FFN dimension | `512` | Higher = more expressiveness, slower | +| `expansion` | Channel expansion ratio | `0.34` | Controls CSPNet expansion | +| `depth_mult` | Depth multiplier | `0.5` | Scales number of layers | + +#### Decoder Configuration (DFINETransformer) + +| Parameter | Description | Default | Impact | +|-----------|-------------|---------|--------| +| `num_layers` | Decoder transformer layers | `3` | More layers = better accuracy, slower training | +| `num_queries` | Maximum detections per image | `300` | Higher = more detections, slower inference | +| `hidden_dim` | Decoder feature dimension | `128` | Must match encoder hidden_dim | +| `dim_feedforward` | FFN dimension | `512` | Higher = more capacity | +| `num_levels` | Feature pyramid levels | `2` | Matches encoder output levels | +| `num_points` | Deformable attention points | `[6, 6]` | More points = finer localization | +| `eval_idx` | Which decoder layer to evaluate | `-1` (last) | Use -1 for best predictions | + +### Training Parameters + +| Parameter | Description | Typical Values | How to Tune | +|-----------|-------------|----------------|-------------| +| `epoches` | Total training epochs | `100-300` | Small datasets: 200-300, Large: 100-150 | +| `total_batch_size` | Batch size (across all GPUs) | `16-64` | Limited by GPU memory; higher = more stable | +| `lr` | Learning rate | `0.0001-0.001` | Rule of thumb: `0.0001 * (batch_size / 16)` | +| `betas` | Adam optimizer betas | `[0.9, 0.999]` | Rarely need to change | +| `weight_decay` | L2 regularization | `0.0001` | Higher prevents overfitting on small datasets | +| `clip_max_norm` | Gradient clipping | `0.1` | Prevents exploding gradients | + +### Data Augmentation Parameters (Advanced) + +#### Core Augmentations + +| Transform | Purpose | Parameters | When to Use | When 
NOT to Use |
+|-----------|---------|------------|-------------|-----------------|
+| `RandomPhotometricDistort` | Color/brightness jittering | `p=0.5` | Always for robustness to lighting | Grayscale/thermal images |
+| `RandomZoomOut` | Creates scale variation | `fill=0` (background color) | Multi-scale objects | Fixed-size objects only |
+| `RandomIoUCrop` | Crops maintaining objects | `p=0.8` (crop probability) | Dense scenes, data augmentation | **Nested objects** (breaks hierarchy) |
+| `SanitizeBoundingBoxes` | Removes invalid boxes | `min_size=1` | Always (data integrity) | Never skip |
+| `RandomHorizontalFlip` | Mirror image horizontally | `p=0.5` | Symmetric scenes | **Asymmetric** (text, logos, arrows) |
+
+#### Advanced Augmentations
+
+| Transform | Purpose | Parameters | Domain-Specific Use Cases | Avoid When |
+|-----------|---------|------------|---------------------------|------------|
+| `GaussianBlur` | Simulates motion/focus blur | `kernel_size=[3,5]`, `sigma=[0.1,2.0]`, `p=0.3` | Low-quality cameras, motion blur, atmospheric effects (dust, fog) | High-resolution, sharp details critical |
+| `RandomRotation` | Rotation invariance | `degrees=10`, `p=0.5` | Aerial imagery, objects with arbitrary orientation | **Circular objects** (radially symmetric), gravity-dependent |
+| `RandomPerspective` | Camera angle variation | `distortion_scale=0.2`, `p=0.3` | 3D scenes, varying camera angles | **Circular features** (distorts geometry), 2D top-down views |
+| `RandomAdjustSharpness` | Sharpness variation | `sharpness_factor=2`, `p=0.3` | Mixed quality data, edge detection tasks | Uniformly sharp datasets |
+| `Mosaic` | 4-image mosaic | `mosaic_prob=0.5` | Small datasets (4x data), multi-scale | Large datasets (adds overhead) |
+
+#### Required Preprocessing (Never Remove)
+
+| Transform | Purpose | Parameters | Notes |
+|-----------|---------|------------|-------|
+| `Resize` | Normalize image size | `size=[640, 640]` | Must match model input size |
+| `ConvertPILImage` | Convert to tensor | `dtype='float32'`, `scale=True` | Scales [0,255] → [0,1] |
+| `ConvertBoxes` | Normalize box format | `fmt='cxcywh'`, `normalize=True` | DETR expects center coords |
+
+#### Augmentation Policy (Stop Heavy Augmentation)
+
+```yaml
+policy:
+  name: stop_epoch
+  epoch: 180  # Stop at 90% of total epochs (200 * 0.9)
+  ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+```
+
+**Why stop augmentation?** Heavy augmentation early helps generalization, but final epochs benefit from clean data for fine-tuning.
+
+**💡 Pro Tip**: Document your augmentation reasoning with comments! See real production example below.
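+
+For intuition, here is a minimal, self-contained sketch of what the `stop_epoch` policy above amounts to. This is **not** DEIM's actual implementation (the dataloader applies the policy internally); the helper name and the op-dict structure are assumptions for illustration only:
+
+```python
+# Illustrative sketch only -- the real policy is applied inside DEIM's dataloader.
+HEAVY_OPS = ('Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop')
+
+def select_transform_ops(ops, epoch, stop_epoch=180, heavy_ops=HEAVY_OPS):
+    """Return the transform ops to use this epoch: drop heavy augs once stop_epoch is reached."""
+    if epoch < stop_epoch:
+        return ops                                   # full augmentation pipeline
+    return [op for op in ops if op['type'] not in heavy_ops]
+
+ops = [
+    {'type': 'RandomPhotometricDistort', 'p': 0.5},
+    {'type': 'RandomIoUCrop', 'p': 0.8},
+    {'type': 'Resize', 'size': [640, 640]},
+    {'type': 'ConvertBoxes', 'fmt': 'cxcywh', 'normalize': True},
+]
+print([op['type'] for op in select_transform_ops(ops, epoch=185)])
+# -> ['Resize', 'ConvertBoxes']  (only the light/required ops remain after epoch 180)
+```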
+
+### Real-World Example: Thermal Imaging for Truck Brakes
+
+Here's a production config with domain-specific reasoning (from `dataloader_under.yml`):
+
+```yaml
+train_dataloader:
+  dataset:
+    transforms:
+      ops:
+        # Thermal signature variations (hot brakes, hub heat)
+        - {type: RandomPhotometricDistort, p: 0.5}
+
+        # Simulates varying truck distances from camera
+        - {type: RandomZoomOut, fill: 0}
+
+        # REDUCED from 0.8 - less aggressive to preserve nested hierarchy
+        # (wheels → hubs → brakes are nested objects)
+        - {type: RandomIoUCrop, p: 0.3}
+
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+
+        # Left/right wheel symmetry
+        - {type: RandomHorizontalFlip}
+
+        # Heat shimmer from hot brakes, dust
+        - {type: GaussianBlur, kernel_size: [3, 5], sigma: [0.1, 2.0], p: 0.3}
+
+        # Critical for hub bolt details
+        - {type: RandomAdjustSharpness, sharpness_factor: 2, p: 0.3}
+
+        # REMOVED RandomRotation - circular wheels/hubs don't benefit (radially symmetric)
+        # REMOVED RandomPerspective - distorts circular features, hurts bolt pattern detection
+
+        - {type: Resize, size: [640, 640]}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+```
+
+**Key Insights**:
+- ✅ Comments explain the **domain context** (thermal, mining, trucks)
+- ✅ Explains **why** augmentations were reduced/removed (nested objects, circular geometry)
+- ✅ Links augmentations to **real-world phenomena** (heat shimmer, dust, varying distances)
+- ✅ Future maintainers understand the reasoning, not just the config
+
+---
+
+## Step 3.5: Advanced Configuration Topics
+
+### Optimizer Configuration (Advanced)
+
+The optimizer config supports **parameter-specific learning rates** using regex patterns:
+
+```yaml
+optimizer:
+  type: AdamW
+  lr: 0.0008              # Default learning rate
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+  # Parameter-specific overrides
+  params:
+    # Backbone gets 50% of base learning rate
+    - params: "^(?=.*backbone)(?!.*norm|bn).*$"
+      lr: 0.0004          # 0.5x base lr
+
+    # Backbone normalization layers: lower lr, no weight decay
+    - params: "^(?=.*backbone)(?=.*norm|bn).*$"
+      lr: 0.0004
+      weight_decay: 0.    # No regularization on norm layers
+
+    # Encoder/decoder normalization and bias: no weight decay
+    - params: "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$"
+      weight_decay: 0.
+```
+
+**Why different learning rates?**
+- **Backbone**: Often pretrained, needs gentler updates
+- **Normalization layers**: Sensitive to weight decay, can destabilize training
+- **Bias terms**: Regularizing bias hurts performance, set weight_decay=0
+
+### Learning Rate Scheduling
+
+```yaml
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [180, 240]  # Reduce lr at these epochs
+  gamma: 0.1              # Multiply lr by 0.1 at each milestone
+```
+
+**Common strategies**:
+- **MultiStepLR**: Step decay at fixed epochs
+- **CosineAnnealingLR**: Smooth decay following cosine curve
+- **ReduceLROnPlateau**: Reduce when validation metric plateaus
+
+### EMA (Exponential Moving Average)
+
+```yaml
+ema:
+  enabled: True
+  decay: 0.9999
+  warmup_epochs: 5
+```
+
+**Benefits**: Smoother convergence, better generalization, more stable validation metrics
+
+### Mixed Precision Training (AMP)
+
+Automatically enabled for GPU training. Speeds up training by ~2x and reduces memory by ~40%.
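+
+The AMP details are handled by the training loop, but as a rough mental model, here is a minimal, self-contained PyTorch sketch (toy model and data, not DEIM's actual trainer) of mixed-precision training with loss scaling plus the gradient clipping described earlier:
+
+```python
+# Minimal AMP sketch -- illustrative only; DEIM wires this up for you on GPU.
+import torch
+import torch.nn as nn
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = nn.Linear(10, 2).to(device)                      # toy stand-in for the detector
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
+scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
+
+for step in range(3):                                    # toy training steps
+    x = torch.randn(8, 10, device=device)
+    y = torch.randint(0, 2, (8,), device=device)
+    optimizer.zero_grad()
+    with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
+        loss = nn.functional.cross_entropy(model(x), y)  # forward pass in mixed precision
+    scaler.scale(loss).backward()                        # backward on the scaled loss
+    scaler.unscale_(optimizer)                           # unscale so clipping sees true grads
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)  # cf. clip_max_norm
+    scaler.step(optimizer)                               # skips the step if grads overflowed
+    scaler.update()
+```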
+ +### Validation Configuration + +```yaml +val_dataloader: + total_batch_size: 64 # Can be larger (no gradients) + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640]} + - {type: ConvertPILImage, dtype: 'float32', scale: True} +``` + +**Key differences from training**: +- No augmentations (only resize + convert) +- Larger batch size (no backward pass) +- No shuffling, no dropping last batch + +--- + +## Step 4: Training Your Model + +```python +from deim import DEIM + +# Initialize with your config +model = DEIM(config='my_dataset') + +# Option 1: Train from scratch +model.train(epochs=100) + +# Option 2: Fine-tune from pretrained weights +model.train( + pretrained='deim_outputs/under/20251002_215916/best_stg2.pth', + epochs=50 +) + +# Option 3: Override config parameters +model.train( + epochs=200, + batch_size=32, + learning_rate=0.0005 +) +``` + +--- + +## Step 5: Inference + +```python +# Load trained model +model.load('deim_outputs/my_dataset/best_stg2.pth') + +# Run inference +results = model.predict('image.jpg', visualize=True) + +# Display results +from PIL import Image +for r in results: + display(Image.fromarray(r['visualization'])) +``` + +--- + +## Advanced Troubleshooting + +### 1. Out of Memory (OOM) + +**Symptoms**: `RuntimeError: CUDA out of memory` + +**Solutions** (in order of effectiveness): +```yaml +# Option 1: Reduce batch size (most effective) +total_batch_size: 8 # Reduce from 16 or 32 + +# Option 2: Use gradient accumulation (simulates larger batch) +gradient_accumulation_steps: 4 # Effective batch = 8 * 4 = 32 + +# Option 3: Smaller model +HGNetv2: + name: "B0" # Smallest backbone +hidden_dim: 96 # Reduce from 128 + +# Option 4: Lower image resolution +- type: Resize + size: [512, 512] # Reduce from 640x640 +``` + +**GPU Memory Guide** (batch size 16): +- B0 + 128 dim: ~4GB +- B0 + 256 dim: ~6GB +- B2 + 256 dim: ~10GB +- B4 + 256 dim: ~16GB + +### 2. Slow Training + +**Symptoms**: <0.5 iterations/second, long epoch times + +**Diagnosis**: +```python +# Profile data loading vs GPU computation +# If GPU utilization <80%, bottleneck is data loading +``` + +**Solutions**: +```yaml +# Data loading bottleneck +num_workers: 8 # Increase from 4 (up to CPU cores) +pin_memory: True # Faster host-to-GPU transfer + +# Too many augmentations +# Remove expensive transforms: +# - type: Mosaic # Most expensive +# - type: RandomRotation # Moderately expensive +# - type: RandomPerspective # Moderately expensive + +# GPU bottleneck (model too large) +total_batch_size: 32 # Increase if memory allows +# Or use smaller model (B0 instead of B2) +``` + +### 3. 
Poor Accuracy / Not Learning + +**Diagnostic Checklist**: + +| Issue | Symptom | Solution | +|-------|---------|----------| +| Bad annotations | mAP <10% after 50 epochs | Verify COCO JSON format, check bboxes | +| Too few epochs | mAP improving but slow | Train 200-300 epochs for small datasets | +| Learning rate too high | Loss diverges/NaN | Reduce lr to 0.0001-0.0005 | +| Learning rate too low | Loss decreases very slowly | Increase lr to 0.001-0.002 | +| Model too small | mAP plateaus at 30-40% | Use B2/B4 backbone, increase hidden_dim | +| Overfitting | Train mAP high, val mAP low | Add augmentations, increase weight_decay | +| Underfitting | Both train and val mAP low | Larger model, train longer, reduce regularization | +| Class imbalance | Good on common classes, bad on rare | Use class weights, oversample rare classes | + +**Advanced debugging**: +```yaml +# Enable gradient clipping if loss explodes +clip_max_norm: 0.1 + +# Freeze backbone initially, then unfreeze +HGNetv2: + freeze_at: 4 # Freeze all stages initially +# After 50 epochs, set freeze_at: -1 and resume training +``` + +### 4. Bounding Box Issues + +**Symptoms**: Boxes in wrong location, wrong size, or missing + +**Coordinate format check**: +- COCO format: `[x, y, width, height]` where (x,y) is **top-left** +- DEIM internal: center format `(cx, cy, w, h)` normalized to [0,1] + +**Common issues**: +```python +# Verify annotations are correct +from pycocotools.coco import COCO +coco = COCO('annotations/instances_train.json') +img_id = 1 +anns = coco.loadAnns(coco.getAnnIds(imgIds=[img_id])) +# Check: bbox values reasonable? Within image bounds? +``` + +### 5. Classes Not Showing Names + +**Solution**: Add `class_names` to **source config**, not output config +```yaml +# File: deim/_configs/_base/dataset_my_dataset.yml (CORRECT) +class_names: + 0: my_class_1 + 1: my_class_2 + +# NOT in deim_outputs/my_dataset/config.yml (output dir, won't work) +``` + +### 6. Model Not Loading Pretrained Weights + +**Symptoms**: Warning about missing/unexpected keys + +**Solutions**: +```python +# Check if num_classes matches +# If pretrained on COCO (80 classes), you need to ignore classification head +model.train( + pretrained='coco_model.pth', + strict_loading=False # Ignore head mismatch +) +``` + +### 7. 
NaN Loss + +**Causes and fixes**: +```yaml +# Cause 1: Learning rate too high +optimizer: + lr: 0.0001 # Reduce from 0.001 + +# Cause 2: No gradient clipping +clip_max_norm: 0.1 # Add this + +# Cause 3: Mixed precision issues (rare) +use_amp: False # Disable AMP temporarily +``` + +--- + +## Model Size vs Performance Guide + +| Backbone | Hidden Dim | Speed (FPS) | mAP | Use Case | +|----------|-----------|-------------|-----|----------| +| B0 | 96 | ~60 | ~40% | Real-time applications | +| B0 | 128 | ~50 | ~45% | Balanced (default) | +| B2 | 256 | ~30 | ~50% | High accuracy needed | +| B4 | 256 | ~20 | ~55% | Maximum accuracy | + +**GPU Memory Usage** (batch size 32): +- B0 + hidden_dim 128: ~6GB +- B2 + hidden_dim 256: ~12GB +- B4 + hidden_dim 256: ~18GB + +--- + +## Example Configs by Use Case + +### Small Objects Detection (e.g., thermal sensors, PCB defects) +```yaml +HGNetv2: + name: "B2" # Need more capacity +hidden_dim: 256 +num_layers: 6 # More layers for fine details + +# More aggressive multi-scale augmentation +train_dataloader: + dataset: + transforms: + ops: + - type: RandomZoomOut + fill: 0 + - type: RandomIoUCrop + p: 0.9 # Increase crop probability +``` + +### Few Classes, High Accuracy (e.g., 1-3 classes) +```yaml +num_classes: 1 +HGNetv2: + name: "B4" # Use largest backbone +hidden_dim: 256 +num_layers: 6 +epoches: 300 # Train longer +``` + +### Fast Inference Required (e.g., real-time detection) +```yaml +HGNetv2: + name: "B0" +hidden_dim: 96 # Minimal dimensions +num_layers: 3 +num_queries: 100 # Fewer queries +``` + +--- + +## Next Steps + +1. **Monitor Training**: Check `deim_outputs/my_dataset/` for logs and checkpoints +2. **Evaluate**: Best model is automatically saved as `best_stg2.pth` +3. **Tune Hyperparameters**: Adjust learning rate, augmentations based on results +4. **Export for Production**: Use `model.export()` for deployment + +--- + +## References + +- [COCO Dataset Format](https://cocodataset.org/#format-data) +- [Data Augmentation Guide](https://pytorch.org/vision/stable/transforms.html) +- [HGNetV2 Paper](https://arxiv.org/abs/2204.00993) diff --git a/docs/FORMAT_CONVERSION.md b/docs/FORMAT_CONVERSION.md new file mode 100644 index 00000000..0372eb60 --- /dev/null +++ b/docs/FORMAT_CONVERSION.md @@ -0,0 +1,220 @@ +# Dataset Format Conversion Guide + +## Overview + +This project uses **COCO format** as the primary annotation format, even though we train YOLO models. This decision is based on: + +1. **Multi-Model Testing**: We evaluate multiple detection architectures (YOLO, RT-DETR, D-FINE, etc.), many of which natively expect COCO format +2. **Standardization**: COCO is the industry standard for object detection datasets +3. **Rich Metadata**: COCO format preserves dataset splits, category information, and annotation metadata in a structured way +4. **Framework Compatibility**: Easier integration with various frameworks (Detectron2, MMDetection, Ultralytics) +5. 
**Evaluation Metrics**: COCO provides standardized evaluation metrics (mAP, AR) used across the research community + +## Format Comparison + +| Aspect | COCO Format | YOLO Format | +|--------|-------------|-------------| +| **File Structure** | Single JSON per split | One .txt per image | +| **Coordinates** | Absolute pixels | Normalized (0-1) | +| **Metadata** | Rich (categories, licenses, info) | Minimal | +| **Training Speed** | Slower parsing | Faster parsing | +| **Debugging** | Harder (large JSON) | Easier (per-image text) | +| **Multi-Framework** | โœ… Wide support | โŒ YOLO-specific | +| **Best For** | Benchmarking, research | YOLO training only | + +## Bidirectional Conversion + +### COCO โ†’ YOLO (Using Ultralytics) + +Ultralytics provides built-in conversion from COCO to YOLO format: + +```python +from ultralytics.data.converter import convert_coco + +# Basic conversion +convert_coco( + labels_dir="path/to/coco/annotations/", # Directory with instances_train2017.json, etc. + save_dir="path/to/output/yolo_labels/", # Output directory for YOLO .txt files + use_segments=False, # Set True for segmentation tasks + use_keypoints=False, # Set True for pose estimation + cls91to80=False, # Set True to map COCO 91 classes to 80 +) + +# With segmentation masks +convert_coco( + labels_dir="datasets/coco/annotations/", + save_dir="datasets/coco_yolo/labels/", + use_segments=True, +) +``` + +**Output Structure:** +``` +yolo_labels/ +โ”œโ”€โ”€ train/ +โ”‚ โ”œโ”€โ”€ image1.txt +โ”‚ โ”œโ”€โ”€ image2.txt +โ”‚ โ””โ”€โ”€ ... +โ””โ”€โ”€ val/ + โ”œโ”€โ”€ image1.txt + โ””โ”€โ”€ ... +``` + +**YOLO Format (per line):** +``` +class_id center_x center_y width height +0 0.716797 0.395833 0.216406 0.147222 +``` + +### YOLO โ†’ COCO (Using External Tool) + +For YOLO to COCO conversion, use the [Yolo-to-COCO-format-converter](https://github.com/Taeyoung96/Yolo-to-COCO-format-converter): + +```bash +# Clone the repository +git clone https://github.com/Taeyoung96/Yolo-to-COCO-format-converter.git +cd Yolo-to-COCO-format-converter + +# Install dependencies +pip install -r requirements.txt +``` + +**Usage:** + +```python +import os +from Yolo_to_COCO import yolo_to_coco + +# Define paths +yolo_labels_dir = "path/to/yolo/labels/" +images_dir = "path/to/images/" +output_json = "path/to/output/coco_annotations.json" + +# Define class names (must match your YOLO classes) +class_names = ["class1", "class2", "class3"] + +# Convert +yolo_to_coco( + yolo_labels_dir=yolo_labels_dir, + images_dir=images_dir, + output_json=output_json, + class_names=class_names, +) +``` + +**Alternative: Manual Conversion Script** + +```python +import json +import os +from pathlib import Path +from PIL import Image + +def yolo_to_coco(yolo_dir, images_dir, class_names, output_json): + """ + Convert YOLO format annotations to COCO format. 
+ + Args: + yolo_dir: Directory containing YOLO .txt files + images_dir: Directory containing corresponding images + class_names: List of class names in order + output_json: Output path for COCO JSON file + """ + coco_format = { + "images": [], + "annotations": [], + "categories": [] + } + + # Create categories + for idx, name in enumerate(class_names): + coco_format["categories"].append({ + "id": idx, + "name": name, + "supercategory": "none" + }) + + annotation_id = 0 + + # Process each image + for img_id, txt_file in enumerate(sorted(Path(yolo_dir).glob("*.txt"))): + # Get corresponding image + img_name = txt_file.stem + img_path = None + for ext in ['.jpg', '.jpeg', '.png']: + candidate = Path(images_dir) / f"{img_name}{ext}" + if candidate.exists(): + img_path = candidate + break + + if not img_path: + continue + + # Get image dimensions + img = Image.open(img_path) + width, height = img.size + + # Add image info + coco_format["images"].append({ + "id": img_id, + "file_name": img_path.name, + "width": width, + "height": height + }) + + # Read YOLO annotations + with open(txt_file, 'r') as f: + for line in f.readlines(): + parts = line.strip().split() + if len(parts) < 5: + continue + + class_id = int(parts[0]) + center_x = float(parts[1]) * width + center_y = float(parts[2]) * height + bbox_width = float(parts[3]) * width + bbox_height = float(parts[4]) * height + + # Convert to COCO format (top-left x, y, width, height) + x = center_x - bbox_width / 2 + y = center_y - bbox_height / 2 + + coco_format["annotations"].append({ + "id": annotation_id, + "image_id": img_id, + "category_id": class_id, + "bbox": [x, y, bbox_width, bbox_height], + "area": bbox_width * bbox_height, + "iscrowd": 0 + }) + annotation_id += 1 + + # Save COCO JSON + with open(output_json, 'w') as f: + json.dump(coco_format, f, indent=2) + + print(f"Conversion complete: {len(coco_format['images'])} images, " + f"{len(coco_format['annotations'])} annotations") + +# Example usage +yolo_to_coco( + yolo_dir="datasets/yolo/labels/train/", + images_dir="datasets/yolo/images/train/", + class_names=["damage", "corrosion", "crack"], + output_json="datasets/coco/annotations/instances_train.json" +) +``` + +## Workflow in This Project + +1. **Annotation**: Create annotations in COCO format using tools like CVAT, LabelImg, or Roboflow +2. **Storage**: Keep master dataset in COCO format (`deim/_configs/_base/dataset_*.yml`) +3. **Training**: When training YOLO models, Ultralytics automatically handles COCO format +4. **Conversion** (if needed): Use `convert_coco()` if you need explicit YOLO .txt files for custom pipelines + +## References + +- [Ultralytics Converter Documentation](https://docs.ultralytics.com/reference/data/converter/) +- [Yolo-to-COCO-format-converter](https://github.com/Taeyoung96/Yolo-to-COCO-format-converter) +- [COCO Dataset Format](https://cocodataset.org/#format-data) +- [YOLO Format Specification](https://docs.ultralytics.com/datasets/detect/) diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md new file mode 100644 index 00000000..a0297d90 --- /dev/null +++ b/docs/QUICKSTART.md @@ -0,0 +1,310 @@ +# DEIM Quick Start - Custom Dataset Training + +Train DEIM on your own dataset in 3 steps. See [CONFIGURATION_REFERENCE.md](CONFIGURATION_REFERENCE.md) for detailed explanations. 
+ +--- + +## Step 1: Prepare Your Dataset (COCO Format) + +``` +my_dataset/ +โ”œโ”€โ”€ train/images/ # Training images +โ”œโ”€โ”€ val/images/ # Validation images +โ””โ”€โ”€ annotations/ + โ”œโ”€โ”€ instances_train.json + โ””โ”€โ”€ instances_val.json +``` + +**COCO JSON format**: Use existing tools to convert YOLO/other formats to COCO. + +--- + +## Step 2: Create 3 Config Files + +Copy and modify the `under` configs as templates: + +### 2.1 Dataset Config +**File**: `deim/_configs/_base/dataset_my_dataset.yml` + +```yaml +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ["bbox"] + +# CHANGE THESE: +num_classes: 3 # Your number of classes +class_names: # Your class names for visualization + 0: cat + 1: dog + 2: bird + +remap_mscoco_category: False # False for custom datasets + +# CHANGE THESE PATHS: +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /path/to/my_dataset/train/images + ann_file: /path/to/my_dataset/annotations/instances_train.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 4 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: /path/to/my_dataset/val/images + ann_file: /path/to/my_dataset/annotations/instances_val.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 4 + drop_last: False + collate_fn: + type: BatchImageCollateFunction +``` + +### 2.2 Dataloader Config (Data Augmentation) +**File**: `deim/_configs/_base/dataloader_my_dataset.yml` + +**Start with this simple config**, then tune based on results: + +```yaml +# Data augmentation and preprocessing + +train_dataloader: + total_batch_size: 16 # Adjust based on GPU memory (8/16/32) + num_workers: 4 + dataset: + transforms: + type: Compose + ops: + # Basic augmentations + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + + # Optional: Add domain-specific augmentations + # - {type: GaussianBlur, kernel_size: [3, 5], sigma: [0.1, 2.0], p: 0.3} + # - {type: RandomRotation, degrees: 10, p: 0.5} + # - {type: RandomPerspective, distortion_scale: 0.2, p: 0.3} + # - {type: RandomAdjustSharpness, sharpness_factor: 2, p: 0.3} + + # Required preprocessing + - {type: Resize, size: [640, 640]} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + + policy: + name: stop_epoch + epoch: 180 # Stop heavy augmentations at 90% of total epochs + ops: ['Mosaic', 'RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + + collate_fn: + type: BatchImageCollateFunction + base_size: 640 + stop_epoch: 180 + + shuffle: True + +val_dataloader: + total_batch_size: 64 # Can be larger (no gradients) + num_workers: 4 + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640]} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False +``` + +**๐Ÿ’ก Augmentation Tips**: +- **Start simple** (basic augmentations above), add more if needed +- **Remove augmentations** that hurt your domain (e.g., rotation for circular objects) +- **Add comments** explaining your reasoning (like the thermal example in docs) + +### 2.3 Main Config +**File**: `deim/_configs/my_dataset.yml` + +```yaml +# DEIM Configuration for 'my_dataset' detection + 
+__include__: + [ + "./_base/dataset_my_dataset.yml", + "./_base/runtime.yml", + "./_base/dfine_hgnetv2.yml", + "./_base/dataloader_my_dataset.yml", + "./_base/optimizer.yml", + ] + +output_dir: ./deim_outputs/my_dataset + +# Model architecture (usually don't need to change) +DEIM: + backbone: HGNetv2 + +HGNetv2: + name: "B0" # B0 (fast) or B2/B4 (accurate) + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: False + use_lab: True + +HybridEncoder: + in_channels: [512, 1024] + feat_strides: [16, 32] + hidden_dim: 128 + use_encoder_idx: [1] + dim_feedforward: 512 + expansion: 0.34 + depth_mult: 0.5 + +DFINETransformer: + feat_channels: [128, 128] + feat_strides: [16, 32] + hidden_dim: 128 + dim_feedforward: 512 + num_levels: 2 + num_layers: 3 + eval_idx: -1 + num_points: [6, 6] + +# Training parameters +epoches: 200 # Adjust based on dataset size + +optimizer: + type: AdamW + lr: 0.0008 # May need tuning + betas: [0.9, 0.999] + weight_decay: 0.0001 + params: + - params: "^(?=.*backbone)(?!.*norm|bn).*$" + lr: 0.0004 + - params: "^(?=.*backbone)(?=.*norm|bn).*$" + lr: 0.0004 + weight_decay: 0. + - params: "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$" + weight_decay: 0. + +train_dataloader: + total_batch_size: 16 # Override if needed + +val_dataloader: + total_batch_size: 64 +``` + +--- + +## Step 3: Train! + +```python +from deim import DEIM + +# Initialize with your config +model = DEIM(config='my_dataset') + +# Train from scratch +model.train(epochs=200) + +# Or fine-tune from pretrained weights +# model.train( +# pretrained='deim_outputs/under/best_stg2.pth', +# epochs=100 +# ) +``` + +Training outputs saved to: `deim_outputs/my_dataset/` + +--- + +## Step 4: Run Inference + +```python +# Load trained model +model = DEIM(config='my_dataset') +model.load('deim_outputs/my_dataset/best_stg2.pth') + +# Run inference (always returns list) +results = model.predict(['image1.jpg', 'image2.jpg'], visualize=True) + +# Display results +from PIL import Image +for r in results: + display(Image.fromarray(r['visualization'])) +``` + +--- + +## Common Changes + +### Adjust Batch Size (Out of Memory?) 
+```yaml +# In dataloader_my_dataset.yml +train_dataloader: + total_batch_size: 8 # Reduce from 16 +``` + +### Speed vs Accuracy Trade-off +```yaml +# In my_dataset.yml +HGNetv2: + name: "B0" # Fastest (default) + # name: "B2" # Balanced + # name: "B4" # Most accurate +``` + +### Train Longer +```yaml +# In my_dataset.yml +epoches: 300 # Increase from 200 +``` + +### Disable Specific Augmentations +```yaml +# In dataloader_my_dataset.yml, comment out unwanted augmentations: +# - {type: RandomRotation, degrees: 10, p: 0.5} # Commented out +``` + +--- + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Out of memory | Reduce `total_batch_size` to 8 or 4 | +| Training too slow | Reduce `num_workers` or disable heavy augmentations | +| Poor accuracy | Train longer (300 epochs), use larger backbone (B2/B4) | +| Class names not showing | Add `class_names` to `dataset_my_dataset.yml` | + +--- + +## Next Steps + +- โœ… Monitor training in `deim_outputs/my_dataset/` +- โœ… Best model saved as `best_stg2.pth` +- โœ… See [CONFIGURATION_REFERENCE.md](CONFIGURATION_REFERENCE.md) for detailed parameter tuning +- โœ… Check `dataloader_under.yml` for real-world augmentation examples with reasoning + +--- + +**Need more details?** See [CONFIGURATION_REFERENCE.md](CONFIGURATION_REFERENCE.md) for: +- Detailed parameter explanations +- When to use/avoid specific augmentations +- Use-case specific configurations +- Performance tuning guide diff --git a/example_usage.py b/example_usage.py new file mode 100644 index 00000000..c9c5f8ce --- /dev/null +++ b/example_usage.py @@ -0,0 +1,113 @@ +#!/home/hidara/miniconda3/envs/deim/bin/python +""" +Example usage of DEIM module + +This demonstrates how to use DEIM for training and inference +in less than 10 lines of code. + +DO NOT RUN THIS SCRIPT WHILE GPU IS BUSY WITH TRAINING! +""" + +from deim import DEIM + + +def example_training(): + """Example: Training from scratch""" + + # Initialize model with 'under' configuration + model = DEIM(config='under') + + # Train the model (this would run actual training on GPU) + # Uncomment to run: + # model.train(epochs=320, batch_size=8) + + print("Training example prepared (not executed to avoid GPU conflict)") + + +def example_training_pretrained(): + """Example: Training with pretrained weights""" + + # Initialize model + model = DEIM(config='sides') + + # Train with pretrained weights (transfer learning) + # Use the best model from a previous training run + model.train( + pretrained='deim_outputs/best_models/sides/best_stg1.pth', + epochs=500, + batch_size=32 + ) + + print("Transfer learning example prepared") + + +def example_inference(): + """Example: Running inference on images""" + + # Initialize model + model = DEIM(config='under') + + # Load trained weights from a completed training run + # model.load('deim_outputs/under/20251002_215916/best_stg2.pth') + + # Run inference on single image + # results = model.predict('image.jpg', visualize=True) + + # Run inference on multiple images + # results = model.predict(['img1.jpg', 'img2.jpg']) + + # Run inference on video + # results = model.predict('video.mp4', save_path='output.mp4') + + print("Inference example prepared") + + +def example_custom_dataset(): + """Example: Training on a custom dataset + + For detailed instructions, see: docs/CUSTOM_DATASET_GUIDE.md + + Quick steps: + 1. Prepare dataset in COCO format + 2. 
Create config files:
+       - deim/_configs/_base/dataset_my_dataset.yml
+       - deim/_configs/_base/dataloader_my_dataset.yml
+       - deim/_configs/my_dataset.yml
+    3. Train!
+    """
+
+    # Initialize model with your custom config name
+    model = DEIM(config='my_dataset')  # Uses deim/_configs/my_dataset.yml
+
+    # Train from scratch
+    model.train(epochs=100)
+
+    # Or fine-tune from pretrained weights
+    # model.train(
+    #     pretrained='deim_outputs/under/best_stg2.pth',
+    #     epochs=50
+    # )
+
+    print("Custom dataset training started!")
+    print("See docs/CUSTOM_DATASET_GUIDE.md for complete configuration guide")
+
+
+if __name__ == '__main__':
+    print("DEIM Usage Examples")
+    print("=" * 50)
+    print("\nNOTE: Examples are commented out to avoid GPU conflicts")
+    print("Uncomment the code you want to run when GPU is available\n")
+
+    # example_training()
+    # example_training_pretrained()  # Starts a long GPU training run when uncommented
+    # example_inference()
+    # example_custom_dataset()
+
+    print("\n✅ All examples prepared successfully!")
+    print("\nTo use DEIM in your code:")
+    print("1. from deim import DEIM")
+    print("2. model = DEIM(config='under')")
+    print("3. model.train(epochs=100)  # For training")
+    print("   OR")
+    print("3. model.load('checkpoint.pth')  # For inference")
+    print("4. results = model.predict('image.jpg')")
diff --git a/figures/teaser_a.png b/figures/teaser_a.png
deleted file mode 100644
index 4d9a952f..00000000
Binary files a/figures/teaser_a.png and /dev/null differ
diff --git a/figures/teaser_b.png b/figures/teaser_b.png
deleted file mode 100644
index 24632dd1..00000000
Binary files a/figures/teaser_b.png and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index 26cd97f2..11a4b7ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,26 @@
-torch>=2.0.1
-torchvision>=0.15.2
-faster-coco-eval>=1.6.5
-PyYAML
-tensorboard
-scipy
-calflops
-transformers
+# DEIM Requirements
+# Python 3.8+ required
+
+# Core dependencies
+torch>=2.0.0
+torchvision>=0.15.0
+numpy>=1.21.0
+opencv-python>=4.7.0
+pillow>=9.0.0
+pyyaml>=6.0
+
+# Visualization
+supervision>=0.17.0
+
+# Data handling
+pandas>=1.5.0
+scipy>=1.9.0
+matplotlib>=3.5.0
+
+# Development tools (optional)
+ruff>=0.1.0
+pyright>=1.1.0
+
+# Additional dependencies for training
+tqdm>=4.65.0
+tensorboard>=2.11.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..5070972b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,55 @@
+"""
+Setup script for DEIM module
+"""
+
+from setuptools import setup, find_packages
+from pathlib import Path
+
+# Read README
+this_directory = Path(__file__).parent
+long_description = (this_directory / "README.md").read_text()
+
+# Read requirements
+requirements = []
+with open('requirements.txt') as f:
+    for line in f:
+        line = line.strip()
+        if line and not line.startswith('#'):
+            requirements.append(line)
+
+setup(
+    name='deim',
+    version='1.0.0',
+    author='DEIM Contributors',
+    description='DEIM - DETR with Improved Matching for object detection',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://github.com/yourusername/deim',
+    packages=find_packages(),
+    package_data={
+        'deim': [
+            '_configs/*.yml',
+            '_configs/_base/*.yml',
+        ],
+    },
+    include_package_data=True,
+    install_requires=requirements,
+    python_requires='>=3.8',
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], + keywords='object detection, deep learning, computer vision, DEIM, DETR', + entry_points={ + 'console_scripts': [ + 'deim=deim.api:main', + ], + }, +) \ No newline at end of file diff --git a/tools/benchmark/dataset.py b/tools/benchmark/dataset.py deleted file mode 100644 index 76fa6491..00000000 --- a/tools/benchmark/dataset.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Copied from RT-DETR (https://github.com/lyuwenyu/RT-DETR) -Copyright(c) 2023 lyuwenyu. All Rights Reserved. -""" - -import os -import glob -from PIL import Image - -import torch -import torch.utils.data as data -import torchvision -import torchvision.transforms as T -import torchvision.transforms.functional as F - -Image.MAX_IMAGE_PIXELS = None - -class ToTensor(T.ToTensor): - def __init__(self) -> None: - super().__init__() - - def __call__(self, pic): - if isinstance(pic, torch.Tensor): - return pic - return super().__call__(pic) - -class PadToSize(T.Pad): - def __init__(self, size, fill=0, padding_mode='constant'): - super().__init__(0, fill, padding_mode) - self.size = size - self.fill = fill - - def __call__(self, img): - """ - Args: - img (PIL Image or Tensor): Image to be padded. - - Returns: - PIL Image or Tensor: Padded image. - """ - w, h = F.get_image_size(img) - padding = (0, 0, self.size[0] - w, self.size[1] - h) - return F.pad(img, padding, self.fill, self.padding_mode) - - -class Dataset(data.Dataset): - def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: - super().__init__() - - self.device = device - self.size = 640 - - self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) - - if preprocess is None: - self.preprocess = T.Compose([ - T.Resize(size=639, max_size=640), - PadToSize(size=(640, 640), fill=114), - ToTensor(), - T.ConvertImageDtype(torch.float), - ]) - else: - self.preprocess = preprocess - - def __len__(self, ): - return len(self.im_path_list) - - def __getitem__(self, index): - # im = Image.open(self.img_path_list[index]).convert('RGB') - im = torchvision.io.read_file(self.im_path_list[index]) - im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) - _, h, w = im.shape # c,h,w - - im = self.preprocess(im) - - blob = { - 'images': im, - 'im_shape': torch.tensor([self.size, self.size]).to(im.device), - 'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), - 'orig_target_sizes': torch.tensor([w, h]).to(im.device), - } - - return blob - - @staticmethod - def post_process(): - pass - - @staticmethod - def collate_fn(): - pass - - -def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): - '''show result - Keys: - 'num_dets', 'det_boxes', 'det_scores', 'det_classes' - ''' - for i in range(blob['image'].shape[0]): - det_scores = outputs['det_scores'][i] - det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] - - im = (blob['image'][i] * 255).to(torch.uint8) - im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) - Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') diff --git a/tools/benchmark/get_info.py b/tools/benchmark/get_info.py deleted file mode 100644 index b72efa35..00000000 --- a/tools/benchmark/get_info.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) - -import argparse -from calflops import calculate_flops -from engine.core import YAMLConfig - -import torch -import torch.nn as nn - -def custom_repr(self): - return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}' -original_repr = torch.Tensor.__repr__ -torch.Tensor.__repr__ = custom_repr - -def main(args, ): - """main - """ - cfg = YAMLConfig(args.config, resume=None) - class Model_for_flops(nn.Module): - def __init__(self, ) -> None: - super().__init__() - self.model = cfg.model.deploy() - - def forward(self, images): - outputs = self.model(images) - return outputs - - model = Model_for_flops().eval() - - flops, macs, _ = calculate_flops(model=model, - input_shape=(1, 3, 640, 640), - output_as_string=True, - output_precision=4) - params = sum(p.numel() for p in model.parameters()) - print("Model FLOPs:%s MACs:%s Params:%s \n" %(flops, macs, params)) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--config', '-c', default= "configs/dfine/dfine_hgnetv2_l_coco.yml", type=str) - args = parser.parse_args() - - main(args) diff --git a/tools/benchmark/requirements.txt b/tools/benchmark/requirements.txt deleted file mode 100644 index 55a3c0f0..00000000 --- a/tools/benchmark/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -onnxruntime -tensorrt -pycuda -calflops -tqdm -# onnx_graphsurgeon # for YOLOs diff --git a/tools/benchmark/trt_benchmark.py b/tools/benchmark/trt_benchmark.py deleted file mode 100644 index a650ac06..00000000 --- a/tools/benchmark/trt_benchmark.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. -""" - -import tensorrt as trt -import pycuda.driver as cuda -from utils import TimeProfiler -import numpy as np -import os -import time -import torch - -from collections import namedtuple, OrderedDict -import glob -import argparse -from dataset import Dataset -from tqdm import tqdm - - -def parse_args(): - parser = argparse.ArgumentParser(description='Argument Parser Example') - parser.add_argument('--COCO_dir', - type=str, - default='/data/COCO2017/val2017', - help="Directory for images to perform inference on.") - parser.add_argument("--engine_dir", - type=str, - help="Directory containing model engine files.") - parser.add_argument('--busy', - action='store_true', - help="Flag to indicate that other processes may be running.") - args = parser.parse_args() - return args - -class TRTInference(object): - def __init__(self, engine_path, device='cuda', backend='torch', max_batch_size=32, verbose=False): - self.engine_path = engine_path - self.device = device - self.backend = backend - self.max_batch_size = max_batch_size - - self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) - self.engine = self.load_engine(engine_path) - self.context = self.engine.create_execution_context() - self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) - self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) - self.input_names = self.get_input_names() - self.output_names = self.get_output_names() - - if self.backend == 'cuda': - self.stream = cuda.Stream() - self.time_profile = TimeProfiler() - self.time_profile_dataset = TimeProfiler() - - def init(self): - self.dynamic = False - - def load_engine(self, path): - trt.init_libnvinfer_plugins(self.logger, '') - with open(path, 'rb') as f, 
trt.Runtime(self.logger) as runtime: - return runtime.deserialize_cuda_engine(f.read()) - - def get_input_names(self): - names = [] - for _, name in enumerate(self.engine): - if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: - names.append(name) - return names - - def get_output_names(self): - names = [] - for _, name in enumerate(self.engine): - if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: - names.append(name) - return names - - def get_bindings(self, engine, context, max_batch_size=32, device=None): - Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) - bindings = OrderedDict() - for i, name in enumerate(engine): - shape = engine.get_tensor_shape(name) - dtype = trt.nptype(engine.get_tensor_dtype(name)) - - if shape[0] == -1: - dynamic = True - shape[0] = max_batch_size - if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: - context.set_input_shape(name, shape) - - if self.backend == 'cuda': - if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: - data = np.random.randn(*shape).astype(dtype) - ptr = cuda.mem_alloc(data.nbytes) - bindings[name] = Binding(name, dtype, shape, data, ptr) - else: - data = cuda.pagelocked_empty(trt.volume(shape), dtype) - ptr = cuda.mem_alloc(data.nbytes) - bindings[name] = Binding(name, dtype, shape, data, ptr) - else: - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) - bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) - return bindings - - def run_torch(self, blob): - for n in self.input_names: - if self.bindings[n].shape != blob[n].shape: - self.context.set_input_shape(n, blob[n].shape) - self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) - - self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) - self.context.execute_v2(list(self.bindings_addr.values())) - outputs = {n: self.bindings[n].data for n in self.output_names} - return outputs - - def async_run_cuda(self, blob): - for n in self.input_names: - cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) - - bindings_addr = [int(v) for _, v in self.bindings_addr.items()] - self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) - - outputs = {} - for n in self.output_names: - cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) - outputs[n] = self.bindings[n].data - - self.stream.synchronize() - - return outputs - - def __call__(self, blob): - if self.backend == 'torch': - return self.run_torch(blob) - elif self.backend == 'cuda': - return self.async_run_cuda(blob) - - def synchronize(self): - if self.backend == 'torch' and torch.cuda.is_available(): - torch.cuda.synchronize() - elif self.backend == 'cuda': - self.stream.synchronize() - - def warmup(self, blob, n): - for _ in range(n): - _ = self(blob) - - def speed(self, blob, n, nonempty_process=False): - times = [] - self.time_profile_dataset.reset() - for i in tqdm(range(n), desc="Running Inference", unit="iteration"): - self.time_profile.reset() - with self.time_profile_dataset: - img = blob[i] - if img['images'] is not None: - img['image'] = img['input'] = img['images'].unsqueeze(0) - else: - img['images'] = img['input'] = img['image'].unsqueeze(0) - with self.time_profile: - _ = self(img) - times.append(self.time_profile.total) - - # end-to-end model only - times = sorted(times) - if len(times) > 100 and nonempty_process: - times = times[:100] - - avg_time = sum(times) / len(times) # Calculate the average of the remaining times - 
return avg_time - -def main(): - FLAGS = parse_args() - dataset = Dataset(FLAGS.infer_dir) - im = torch.ones(1, 3, 640, 640).cuda() - blob = { - 'image': im, - 'images': im, - 'input': im, - 'im_shape': torch.tensor([640, 640]).to(im.device), - 'scale_factor': torch.tensor([1, 1]).to(im.device), - 'orig_target_sizes': torch.tensor([640, 640]).to(im.device), - } - - engine_files = glob.glob(os.path.join(FLAGS.models_dir, "*.engine")) - results = [] - - for engine_file in engine_files: - print(f"Testing engine: {engine_file}") - model = TRTInference(engine_file, max_batch_size=1, verbose=False) - model.init() - model.warmup(blob, 1000) - t = [] - for _ in range(1): - t.append(model.speed(dataset, 1000, FLAGS.busy)) - avg_latency = 1000 * torch.tensor(t).mean() - results.append((engine_file, avg_latency)) - print(f"Engine: {engine_file}, Latency: {avg_latency:.2f} ms") - - del model - torch.cuda.empty_cache() - time.sleep(1) - - sorted_results = sorted(results, key=lambda x: x[1]) - for engine_file, latency in sorted_results: - print(f"Engine: {engine_file}, Latency: {latency:.2f} ms") - -if __name__ == '__main__': - main() diff --git a/tools/benchmark/utils.py b/tools/benchmark/utils.py deleted file mode 100644 index 23e1800a..00000000 --- a/tools/benchmark/utils.py +++ /dev/null @@ -1,80 +0,0 @@ -import time -import contextlib -import numpy as np -from PIL import Image -from collections import OrderedDict - -import onnx -import torch -import onnx_graphsurgeon - - -def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): - '''--loadInputs='image:input_tensor.bin' - ''' - im = Image.open(path).resize(size) - data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. - data.tofile(output_name) - - -def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): - ''' - http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html - https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py - ''' - onnx_model = onnx.load(path) - - if simplify: - from onnxsim import simplify - onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) - - graph = onnx_graphsurgeon.import_onnx(onnx_model) - graph.toposort() - graph.fold_constants() - graph.cleanup() - - topk = max_output_boxes - attrs = OrderedDict(plugin_version='1', - background_class=-1, - max_output_boxes=topk, - score_threshold=score_threshold, - iou_threshold=iou_threshold, - score_activation=False, - box_coding=0, ) - - outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), - onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), - onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), - onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] - - graph.layer(op='EfficientNMS_TRT', - name="batched_nms", - inputs=[graph.outputs[0], - graph.outputs[1]], - outputs=outputs, - attrs=attrs, ) - - graph.outputs = outputs - graph.cleanup().toposort() - - onnx.save(onnx_graphsurgeon.export_onnx(graph), 'yolo_w_nms.onnx') - - -class TimeProfiler(contextlib.ContextDecorator): - def __init__(self, ): - self.total = 0 - - def __enter__(self, ): - self.start = self.time() - return self - - def __exit__(self, type, value, traceback): - self.total += self.time() - self.start - - def reset(self, ): - self.total = 0 - - def time(self, ): - if torch.cuda.is_available(): - torch.cuda.synchronize() - return 
time.time() diff --git a/tools/dataset/remap_obj365.py b/tools/dataset/remap_obj365.py deleted file mode 100644 index f76214e7..00000000 --- a/tools/dataset/remap_obj365.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. -""" - -import json -import os -import argparse - - -def update_image_paths(images, new_prefix): - print('Updating image paths with new prefix...') - for img in images: - split = img['file_name'].split('/')[1:] - img['file_name'] = os.path.join(new_prefix, *split) - print('Image paths updated.') - return images - -def create_split_annotations(original_annotations, split_image_ids, new_prefix, output_file): - print(f'Creating split annotations for {output_file}...') - new_images = [img for img in original_annotations['images'] if img['id'] in split_image_ids] - print(f'Number of images selected: {len(new_images)}') - if new_prefix is not None: - new_images = update_image_paths(new_images, new_prefix) - - new_annotations = { - 'images': new_images, - 'annotations': [ann for ann in original_annotations['annotations'] if ann['image_id'] in split_image_ids], - 'categories': original_annotations['categories'] - } - print(f'Number of annotations selected: {len(new_annotations["annotations"])}') - with open(output_file, 'w') as f: - json.dump(new_annotations, f) - print(f'Annotations saved to {output_file}') - -def parse_arguments(): - parser = argparse.ArgumentParser(description='Split and update dataset annotations.') - parser.add_argument( - '--base_dir', - type=str, - default='/datassd/objects365', - help='Base directory of the dataset, e.g., /data/Objects365/data' - ) - parser.add_argument( - '--new_val_size', - type=int, - default=5000, - help='Number of images to include in the new validation set (default: 5000)' - ) - parser.add_argument( - '--output_suffix', - type=str, - default='new', - help='Suffix to add to new annotation files (default: new)' - ) - return parser.parse_args() - -def main(): - args = parse_arguments() - base_dir = args.base_dir - new_val_size = args.new_val_size - output_suffix = args.output_suffix - - # Define paths based on the base directory - original_train_ann_file = os.path.join(base_dir, 'train', 'zhiyuan_objv2_train.json') - original_val_ann_file = os.path.join(base_dir, 'val', 'zhiyuan_objv2_val.json') - - new_val_ann_file = os.path.join(base_dir, 'val', f'{output_suffix}_zhiyuan_objv2_val.json') - new_train_ann_file = os.path.join(base_dir, 'train', f'{output_suffix}_zhiyuan_objv2_train.json') - - # Check if original annotation files exist - if not os.path.isfile(original_train_ann_file): - print(f'Error: Training annotation file not found at {original_train_ann_file}') - return - if not os.path.isfile(original_val_ann_file): - print(f'Error: Validation annotation file not found at {original_val_ann_file}') - return - - # Load the original training and validation annotations - print('Loading original training annotations...') - with open(original_train_ann_file, 'r') as f: - train_annotations = json.load(f) - print('Training annotations loaded.') - - print('Loading original validation annotations...') - with open(original_val_ann_file, 'r') as f: - val_annotations = json.load(f) - print('Validation annotations loaded.') - - # Extract image IDs from the original validation set - print('Extracting image IDs from the validation set...') - val_image_ids = [img['id'] for img in val_annotations['images']] - print(f'Total validation images: {len(val_image_ids)}') - - # Split image IDs for the new 
training and validation sets - print(f'Splitting validation images into new validation set of size {new_val_size} and training set...') - new_val_image_ids = val_image_ids[:new_val_size] - new_train_image_ids = val_image_ids[new_val_size:] - print(f'New validation set size: {len(new_val_image_ids)}') - print(f'New training set size from validation images: {len(new_train_image_ids)}') - - # Create new validation annotation file - print('Creating new validation annotations...') - create_split_annotations(val_annotations, new_val_image_ids, None, new_val_ann_file) - print('New validation annotations created.') - - # Combine the remaining validation images and annotations with the original training data - print('Preparing new training images and annotations...') - new_train_images = [img for img in val_annotations['images'] if img['id'] in new_train_image_ids] - print(f'Number of images from validation to add to training: {len(new_train_images)}') - new_train_images = update_image_paths(new_train_images, 'images_from_val') - new_train_annotations = [ann for ann in val_annotations['annotations'] if ann['image_id'] in new_train_image_ids] - print(f'Number of annotations from validation to add to training: {len(new_train_annotations)}') - - # Add the original training images and annotations - print('Adding original training images and annotations...') - new_train_images.extend(train_annotations['images']) - new_train_annotations.extend(train_annotations['annotations']) - print(f'Total training images: {len(new_train_images)}') - print(f'Total training annotations: {len(new_train_annotations)}') - - # Create a new training annotation dictionary - print('Creating new training annotations dictionary...') - new_train_annotations_dict = { - 'images': new_train_images, - 'annotations': new_train_annotations, - 'categories': train_annotations['categories'] - } - print('New training annotations dictionary created.') - - # Save the new training annotations - print('Saving new training annotations...') - with open(new_train_ann_file, 'w') as f: - json.dump(new_train_annotations_dict, f) - print(f'New training annotations saved to {new_train_ann_file}') - - print('Processing completed successfully.') - -if __name__ == '__main__': - main() diff --git a/tools/dataset/resize_obj365.py b/tools/dataset/resize_obj365.py deleted file mode 100644 index 0396120a..00000000 --- a/tools/dataset/resize_obj365.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import os -import json -from PIL import Image -from concurrent.futures import ThreadPoolExecutor -import argparse - - -def resize_image_and_update_annotations(image_path, annotations, max_size=640): - print(f"Processing image: {image_path}") - try: - with Image.open(image_path) as img: - w, h = img.size - if max(w, h) <= max_size: - return annotations, w, h, False # No need to resize - - scale = max_size / max(w, h) - new_w = int(w * scale) - new_h = int(h * scale) - print(f"Resizing image to width={new_w}, height={new_h}") - - img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) - # TODO - new_image_path = image_path.replace('.jpg', '_resized{}.jpg'.format(max_size)) - img.save(new_image_path) - print(f"Resized image saved: {new_image_path}") - print(f"Original size: ({w}, {h}), New size: ({new_w}, {new_h})") - - # Update annotations - for ann in annotations: - ann['area'] = ann['area'] * (scale ** 2) - ann['bbox'] = [coord * scale for coord in ann['bbox']] - if 'orig_size' in ann: - ann['orig_size'] = (new_w, new_h) - if 'size' in ann: - ann['size'] = (new_w, new_h) - - except Exception as e: - print(f"Error processing {image_path}: {e}") - return None - - return annotations, new_w, new_h, True - -def resize_images_and_update_annotations(base_dir, subset, max_size=640, num_workers=4): - print(f"Starting to resize images and update annotations for subset: {subset}") - json_file = os.path.join(base_dir, subset, 'new_zhiyuan_objv2_{}.json'.format(subset)) - if not os.path.isfile(json_file): - print(f'Error: JSON file not found at {json_file}') - return - - print(f"Loading JSON file: {json_file}") - with open(json_file, 'r') as f: - data = json.load(f) - print("JSON file loaded.") - - print("Preparing image annotations mapping...") - image_annotations = {img['id']: [] for img in data['images']} - for ann in data['annotations']: - image_annotations[ann['image_id']].append(ann) - print("Image annotations mapping prepared.") - - def process_image(image_info): - image_path = os.path.join(base_dir, subset, image_info['file_name']) - results = resize_image_and_update_annotations(image_path, image_annotations[image_info['id']], max_size) - if results is None: - updated_annotations, new_w, new_h, resized = None, None, None, None - else: - updated_annotations, new_w, new_h, resized = results - return image_info, updated_annotations, new_w, new_h, resized - - print(f"Processing images with {num_workers} worker threads...") - with ThreadPoolExecutor(max_workers=num_workers) as executor: - results = list(executor.map(process_image, data['images'])) - print("Image processing completed.") - - new_images = [] - new_annotations = [] - - print("Updating image and annotation data...") - for image_info, updated_annotations, new_w, new_h, resized in results: - if updated_annotations is not None: - image_info['width'] = new_w - image_info['height'] = new_h - image_annotations[image_info['id']] = updated_annotations - if resized: - image_info['file_name'] = image_info['file_name'].replace('.jpg', '_resized{}.jpg'.format(max_size)) - new_images.append(image_info) - new_annotations.extend(updated_annotations) - print(f"Total images processed: {len(new_images)}") - print(f"Total annotations updated: {len(new_annotations)}") - - new_data = { - 'images': new_images, - 'annotations': new_annotations, - 'categories': data['categories'] - } - - new_json_file = json_file.replace('.json', '_resized{}.json'.format(max_size)) - print('Saving new training annotations...') - with open(new_json_file, 'w') as f: - 
json.dump(new_data, f) - print(f'New JSON file saved to {new_json_file}') - -def parse_arguments(): - parser = argparse.ArgumentParser(description='Resize images and update dataset annotations for both train and val sets.') - parser.add_argument( - '--base_dir', - type=str, - default='/datassd/objects365', - help='Base directory of the dataset, e.g., /data/Objects365/data' - ) - parser.add_argument( - '--max_size', - type=int, - default=640, - help='Maximum size for the longer side of the image (default: 640)' - ) - parser.add_argument( - '--num_workers', - type=int, - default=4, - help='Number of worker threads for parallel processing (default: 4)' - ) - args = parser.parse_args() - return args - -def main(): - args = parse_arguments() - base_dir = args.base_dir - max_size = args.max_size - num_workers = args.num_workers - - subsets = ['train', 'val'] - for subset in subsets: - print(f'Processing subset: {subset}') - resize_images_and_update_annotations( - base_dir=base_dir, - subset=subset, - max_size=max_size, - num_workers=num_workers - ) - print("All subsets processed.") - -if __name__ == "__main__": - main() diff --git a/tools/deployment/export_onnx.py b/tools/deployment/export_onnx.py deleted file mode 100644 index 4d60ae74..00000000 --- a/tools/deployment/export_onnx.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. ---------------------------------------------------------------------------------- -Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR) -Copyright (c) 2023 lyuwenyu. All Rights Reserved. -""" - -import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) - -import torch -import torch.nn as nn - -from engine.core import YAMLConfig - - -def main(args, ): - """main - """ - cfg = YAMLConfig(args.config, resume=args.resume) - - if 'HGNetv2' in cfg.yaml_cfg: - cfg.yaml_cfg['HGNetv2']['pretrained'] = False - - if args.resume: - checkpoint = torch.load(args.resume, map_location='cpu') - if 'ema' in checkpoint: - state = checkpoint['ema']['module'] - else: - state = checkpoint['model'] - - # NOTE load train mode state -> convert to deploy mode - cfg.model.load_state_dict(state) - - else: - # raise AttributeError('Only support resume to load model.state_dict by now.') - print('not load model.state_dict, use default init state dict...') - - class Model(nn.Module): - def __init__(self, ) -> None: - super().__init__() - self.model = cfg.model.deploy() - self.postprocessor = cfg.postprocessor.deploy() - - def forward(self, images, orig_target_sizes): - outputs = self.model(images) - outputs = self.postprocessor(outputs, orig_target_sizes) - return outputs - - model = Model() - - data = torch.rand(32, 3, 640, 640) - size = torch.tensor([[640, 640]]) - _ = model(data, size) - - dynamic_axes = { - 'images': {0: 'N', }, - 'orig_target_sizes': {0: 'N'} - } - - output_file = args.resume.replace('.pth', '.onnx') if args.resume else 'model.onnx' - - torch.onnx.export( - model, - (data, size), - output_file, - input_names=['images', 'orig_target_sizes'], - output_names=['labels', 'boxes', 'scores'], - dynamic_axes=dynamic_axes, - opset_version=16, - verbose=False, - do_constant_folding=True, - ) - - if args.check: - import onnx - onnx_model = onnx.load(output_file) - onnx.checker.check_model(onnx_model) - print('Check export onnx model done...') - - if args.simplify: - import onnx - import onnxsim - 
dynamic = True - # input_shapes = {'images': [1, 3, 640, 640], 'orig_target_sizes': [1, 2]} if dynamic else None - input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None - onnx_model_simplify, check = onnxsim.simplify(output_file, test_input_shapes=input_shapes) - onnx.save(onnx_model_simplify, output_file) - print(f'Simplify onnx model {check}...') - - -if __name__ == '__main__': - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--config', '-c', default='configs/dfine/dfine_hgnetv2_l_coco.yml', type=str, ) - parser.add_argument('--resume', '-r', type=str, ) - parser.add_argument('--check', action='store_true', default=True,) - parser.add_argument('--simplify', action='store_true', default=True,) - args = parser.parse_args() - main(args) diff --git a/tools/deployment/export_yolo_w_nms.py b/tools/deployment/export_yolo_w_nms.py deleted file mode 100644 index 95c89b21..00000000 --- a/tools/deployment/export_yolo_w_nms.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -import torchvision - -import numpy as np -import onnxruntime as ort - -from utils import yolo_insert_nms - -class YOLO11(torch.nn.Module): - def __init__(self, name) -> None: - super().__init__() - from ultralytics import YOLO - # Load a model - # build a new model from scratch - # model = YOLO(f'{name}.yaml') - - # load a pretrained model (recommended for training) - model = YOLO("yolo11n.pt") - self.model = model.model - - def forward(self, x): - '''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 - ''' - pred: torch.Tensor = self.model(x)[0] # n 84 8400, - pred = pred.permute(0, 2, 1) - boxes, scores = pred.split([4, 80], dim=-1) - boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') - - return boxes, scores - - - -def export_onnx(name='yolov8n'): - '''export onnx - ''' - m = YOLO11(name) - - x = torch.rand(1, 3, 640, 640) - dynamic_axes = { - 'image': {0: '-1'} - } - torch.onnx.export(m, x, f'{name}.onnx', - input_names=['image'], - output_names=['boxes', 'scores'], - opset_version=13, - dynamic_axes=dynamic_axes) - - data = np.random.rand(1, 3, 640, 640).astype(np.float32) - sess = ort.InferenceSession(f'{name}.onnx') - _ = sess.run(output_names=None, input_feed={'image': data}) - - import onnx - import onnxslim - model_onnx = onnx.load(f'{name}.onnx') - model_onnx = onnxslim.slim(model_onnx) - onnx.save(model_onnx, f'{name}.onnx') - - -if __name__ == '__main__': - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--name', type=str, default='yolo11n_tuned') - parser.add_argument('--score_threshold', type=float, default=0.01) - parser.add_argument('--iou_threshold', type=float, default=0.6) - parser.add_argument('--max_output_boxes', type=int, default=300) - args = parser.parse_args() - - export_onnx(name=args.name) - - yolo_insert_nms(path=f'{args.name}.onnx', - score_threshold=args.score_threshold, - iou_threshold=args.iou_threshold, - max_output_boxes=args.max_output_boxes, ) diff --git a/tools/inference/onnx_inf.py b/tools/inference/onnx_inf.py deleted file mode 100644 index 05730920..00000000 --- a/tools/inference/onnx_inf.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import torch -import torchvision.transforms as T -import numpy as np -import onnxruntime as ort -from PIL import Image, ImageDraw -import cv2 - - -def resize_with_aspect_ratio(image, size, interpolation=Image.BILINEAR): - """Resizes an image while maintaining aspect ratio and pads it.""" - original_width, original_height = image.size - ratio = min(size / original_width, size / original_height) - new_width = int(original_width * ratio) - new_height = int(original_height * ratio) - image = image.resize((new_width, new_height), interpolation) - - # Create a new image with the desired size and paste the resized image onto it - new_image = Image.new("RGB", (size, size)) - new_image.paste(image, ((size - new_width) // 2, (size - new_height) // 2)) - return new_image, ratio, (size - new_width) // 2, (size - new_height) // 2 - - -def draw(images, labels, boxes, scores, ratios, paddings, thrh=0.4): - result_images = [] - for i, im in enumerate(images): - draw = ImageDraw.Draw(im) - scr = scores[i] - lab = labels[i][scr > thrh] - box = boxes[i][scr > thrh] - scr = scr[scr > thrh] - - ratio = ratios[i] - pad_w, pad_h = paddings[i] - - for lbl, bb in zip(lab, box): - # Adjust bounding boxes according to the resizing and padding - bb = [ - (bb[0] - pad_w) / ratio, - (bb[1] - pad_h) / ratio, - (bb[2] - pad_w) / ratio, - (bb[3] - pad_h) / ratio, - ] - draw.rectangle(bb, outline='red') - draw.text((bb[0], bb[1]), text=str(lbl), fill='blue') - - result_images.append(im) - return result_images - - -def process_image(sess, im_pil): - # Resize image while preserving aspect ratio - resized_im_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(im_pil, 640) - orig_size = torch.tensor([[resized_im_pil.size[1], resized_im_pil.size[0]]]) - - transforms = T.Compose([ - T.ToTensor(), - ]) - im_data = transforms(resized_im_pil).unsqueeze(0) - - output = sess.run( - output_names=None, - input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} - ) - - labels, boxes, scores = output - - result_images = draw( - [im_pil], labels, boxes, scores, - [ratio], [(pad_w, pad_h)] - ) - result_images[0].save('onnx_result.jpg') - print("Image processing complete. 
Result saved as 'result.jpg'.") - - -def process_video(sess, video_path): - cap = cv2.VideoCapture(video_path) - - # Get video properties - fps = cap.get(cv2.CAP_PROP_FPS) - orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - - # Define the codec and create VideoWriter object - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - out = cv2.VideoWriter('onnx_result.mp4', fourcc, fps, (orig_w, orig_h)) - - frame_count = 0 - print("Processing video frames...") - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - - # Convert frame to PIL image - frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - - # Resize frame while preserving aspect ratio - resized_frame_pil, ratio, pad_w, pad_h = resize_with_aspect_ratio(frame_pil, 640) - orig_size = torch.tensor([[resized_frame_pil.size[1], resized_frame_pil.size[0]]]) - - transforms = T.Compose([ - T.ToTensor(), - ]) - im_data = transforms(resized_frame_pil).unsqueeze(0) - - output = sess.run( - output_names=None, - input_feed={'images': im_data.numpy(), "orig_target_sizes": orig_size.numpy()} - ) - - labels, boxes, scores = output - - # Draw detections on the original frame - result_images = draw( - [frame_pil], labels, boxes, scores, - [ratio], [(pad_w, pad_h)] - ) - frame_with_detections = result_images[0] - - # Convert back to OpenCV image - frame = cv2.cvtColor(np.array(frame_with_detections), cv2.COLOR_RGB2BGR) - - # Write the frame - out.write(frame) - frame_count += 1 - - if frame_count % 10 == 0: - print(f"Processed {frame_count} frames...") - - cap.release() - out.release() - print("Video processing complete. Result saved as 'result.mp4'.") - - -def main(args): - """Main function.""" - # Load the ONNX model - sess = ort.InferenceSession(args.onnx) - print(f"Using device: {ort.get_device()}") - - input_path = args.input - - try: - # Try to open the input as an image - im_pil = Image.open(input_path).convert('RGB') - process_image(sess, im_pil) - except IOError: - # Not an image, process as video - process_video(sess, input_path) - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--onnx', type=str, required=True, help='Path to the ONNX model file.') - parser.add_argument('--input', type=str, required=True, help='Path to the input image or video file.') - args = parser.parse_args() - main(args) diff --git a/tools/inference/requirements.txt b/tools/inference/requirements.txt deleted file mode 100644 index 46a470c8..00000000 --- a/tools/inference/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -onnxruntime -tensorrt diff --git a/tools/inference/torch_inf.py b/tools/inference/torch_inf.py deleted file mode 100644 index 5103ad8a..00000000 --- a/tools/inference/torch_inf.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import torch -import torch.nn as nn -import torchvision.transforms as T - -import numpy as np -from PIL import Image, ImageDraw - -import sys -import os -import cv2 # Added for video processing - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) -from engine.core import YAMLConfig - - -def draw(images, labels, boxes, scores, thrh=0.4): - for i, im in enumerate(images): - draw = ImageDraw.Draw(im) - - scr = scores[i] - lab = labels[i][scr > thrh] - box = boxes[i][scr > thrh] - scrs = scr[scr > thrh] - - for j, b in enumerate(box): - draw.rectangle(list(b), outline='red') - draw.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", fill='blue', ) - - im.save('torch_results.jpg') - - -def process_image(model, device, file_path): - im_pil = Image.open(file_path).convert('RGB') - w, h = im_pil.size - orig_size = torch.tensor([[w, h]]).to(device) - - transforms = T.Compose([ - T.Resize((640, 640)), - T.ToTensor(), - ]) - im_data = transforms(im_pil).unsqueeze(0).to(device) - - output = model(im_data, orig_size) - labels, boxes, scores = output - - draw([im_pil], labels, boxes, scores) - - -def process_video(model, device, file_path): - cap = cv2.VideoCapture(file_path) - - # Get video properties - fps = cap.get(cv2.CAP_PROP_FPS) - orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - - # Define the codec and create VideoWriter object - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h)) - - transforms = T.Compose([ - T.Resize((640, 640)), - T.ToTensor(), - ]) - - frame_count = 0 - print("Processing video frames...") - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - - # Convert frame to PIL image - frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - - w, h = frame_pil.size - orig_size = torch.tensor([[w, h]]).to(device) - - im_data = transforms(frame_pil).unsqueeze(0).to(device) - - output = model(im_data, orig_size) - labels, boxes, scores = output - - # Draw detections on the frame - draw([frame_pil], labels, boxes, scores) - - # Convert back to OpenCV image - frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR) - - # Write the frame - out.write(frame) - frame_count += 1 - - if frame_count % 10 == 0: - print(f"Processed {frame_count} frames...") - - cap.release() - out.release() - print("Video processing complete. 
Result saved as 'results_video.mp4'.") - - -def main(args): - """Main function""" - cfg = YAMLConfig(args.config, resume=args.resume) - - if 'HGNetv2' in cfg.yaml_cfg: - cfg.yaml_cfg['HGNetv2']['pretrained'] = False - - if args.resume: - checkpoint = torch.load(args.resume, map_location='cpu') - if 'ema' in checkpoint: - state = checkpoint['ema']['module'] - else: - state = checkpoint['model'] - else: - raise AttributeError('Only support resume to load model.state_dict by now.') - - # Load train mode state and convert to deploy mode - cfg.model.load_state_dict(state) - - class Model(nn.Module): - def __init__(self): - super().__init__() - self.model = cfg.model.deploy() - self.postprocessor = cfg.postprocessor.deploy() - - def forward(self, images, orig_target_sizes): - outputs = self.model(images) - outputs = self.postprocessor(outputs, orig_target_sizes) - return outputs - - device = args.device - model = Model().to(device) - - # Check if the input file is an image or a video - file_path = args.input - if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']: - # Process as image - process_image(model, device, file_path) - print("Image processing complete.") - else: - # Process as video - process_video(model, device, file_path) - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, required=True) - parser.add_argument('-r', '--resume', type=str, required=True) - parser.add_argument('-i', '--input', type=str, required=True) - parser.add_argument('-d', '--device', type=str, default='cpu') - args = parser.parse_args() - main(args) diff --git a/tools/inference/torch_inf_vis.py b/tools/inference/torch_inf_vis.py deleted file mode 100644 index 7a30d0e1..00000000 --- a/tools/inference/torch_inf_vis.py +++ /dev/null @@ -1,223 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-"""
-
-import torch
-import torch.nn as nn
-import torchvision.transforms as T
-
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-
-import sys
-import os
-import cv2  # Added for video processing
-import random
-import matplotlib.pyplot as plt
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
-from engine.core import YAMLConfig
-
-
-label_map = {
-    1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane',
-    6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight',
-    11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter',
-    15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse',
-    20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra',
-    25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe',
-    30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee',
-    35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat',
-    40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket',
-    44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork',
-    49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple',
-    54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog',
-    59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa',
-    64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window',
-    69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop',
-    74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave',
-    79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender',
-    84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear',
-    89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush'
-}
-
-# Automatically generate colors (using matplotlib's color scheme)
-COLORS = plt.cm.tab20.colors  # Use 20 distinct colors, suitable for CVPR papers
-COLOR_MAP = {label: tuple([int(c * 255) for c in COLORS[i % len(COLORS)]]) for i, label in enumerate(label_map)}
-
-
-# Drawing function
-def draw(image, labels, boxes, scores, thrh=0.5):
-    draw = ImageDraw.Draw(image)
-    font = ImageFont.load_default()  # Can be replaced with a path to a higher-quality font file
-    labels, boxes, scores = labels[scores > thrh], boxes[scores > thrh], scores[scores > thrh]
-
-    for j, box in enumerate(boxes):
-        category = labels[j].item()
-        color = COLOR_MAP.get(category, (255, 255, 255))  # Default to white
-        box = list(map(int, box))
-
-        # Draw the bounding box
-        draw.rectangle(box, outline=color, width=3)
-
-        # Add the label and confidence
-        text = f"{label_map[category]} {scores[j].item():.2f}"
-        text_bbox = draw.textbbox((0, 0), text, font=font)  # Get the text bounding box
-        text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
-        # Add a text background
-        text_background = [box[0], box[1] - text_height - 2, box[0] + text_width + 4, box[1]]
-        draw.rectangle(text_background, fill=color)
-        # Draw the text
-        draw.text((box[0] + 2, box[1] - text_height - 2), text, fill="black", font=font)
-
-    return image
-
-
-def process_image(model, file_path):
-    im_pil = Image.open(file_path).convert('RGB')
-    w, h = im_pil.size
-    orig_size = torch.tensor([[w, h]]).cuda()
-
-    transforms = T.Compose([
-        T.Resize((640, 640)),
-        T.ToTensor(),
-    ])
-    im_data = transforms(im_pil).unsqueeze(0).cuda()
-
-    output = model(im_data, orig_size)
-
-    # Unpack the list-of-dicts postprocessor output to match draw()'s signature (as in process_dataset)
-    draw(im_pil, output[0]['labels'], output[0]['boxes'], output[0]['scores'])
-
-
-def process_video(model, file_path):
-    cap = cv2.VideoCapture(file_path)
-
-    # Get video properties
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-    # Define the codec and create VideoWriter object
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter('torch_results.mp4', fourcc, fps, (orig_w, orig_h))
-
-    transforms = T.Compose([
-        T.Resize((640, 640)),
-        T.ToTensor(),
-    ])
-
-    frame_count = 0
-    print("Processing video frames...")
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-
-        # Convert frame to PIL image
-        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-
-        w, h = frame_pil.size
-        orig_size = torch.tensor([[w, h]]).cuda()
-
-        im_data = transforms(frame_pil).unsqueeze(0).cuda()
-
-        output = model(im_data, orig_size)
-        labels, boxes, scores = output[0]['labels'], output[0]['boxes'], output[0]['scores']
-
-        # Draw detections on the frame
-        draw(frame_pil, labels, boxes, scores)
-
-        # Convert back to OpenCV image
-        frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)
-
-        # Write the frame
-        out.write(frame)
-        frame_count += 1
-
-        if frame_count % 10 == 0:
-            print(f"Processed {frame_count} frames...")
-
-    cap.release()
-    out.release()
-    print("Video processing complete. Result saved as 'torch_results.mp4'.")
-
-def process_dataset(model, dataset_path, output_path, thrh=0.5):
-    os.makedirs(output_path, exist_ok=True)
-    image_paths = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith(('.jpg', '.png'))]
-
-    transforms = T.Compose([
-        T.Resize((640, 640)),
-        T.ToTensor(),
-    ])
-
-    print(f"Found {len(image_paths)} images in validation set...")
-    for idx, file_path in enumerate(image_paths):
-        im_pil = Image.open(file_path).convert('RGB')
-        w, h = im_pil.size
-        orig_size = torch.tensor([[w, h]]).cuda()
-
-        # Image preprocessing
-        im_data = transforms(im_pil).unsqueeze(0).cuda()
-        output = model(im_data, orig_size)
-        labels, boxes, scores = output[0]['labels'], output[0]['boxes'], output[0]['scores']
-
-        # Draw the results
-        vis_image = draw(im_pil.copy(), labels, boxes, scores, thrh)
-        save_path = os.path.join(output_path, f"vis_{os.path.basename(file_path)}")
-        vis_image.save(save_path)
-
-        if idx % 500 == 0:
-            print(f"Processed {idx}/{len(image_paths)} images...")
-
-    print("Visualization complete.
Results saved in:", output_path) - - -def main(args): - """Main function""" - cfg = YAMLConfig(args.config, resume=args.resume) - - if 'HGNetv2' in cfg.yaml_cfg: - cfg.yaml_cfg['HGNetv2']['pretrained'] = False - - if args.resume: - checkpoint = torch.load(args.resume, map_location='cpu') - if 'ema' in checkpoint: - state = checkpoint['ema']['module'] - else: - state = checkpoint['model'] - else: - raise AttributeError('Only support resume to load model.state_dict by now.') - - # Load train mode state and convert to deploy mode - cfg.model.load_state_dict(state) - - class Model(nn.Module): - def __init__(self): - super().__init__() - self.model = cfg.model.eval().cuda() - self.postprocessor = cfg.postprocessor.eval().cuda() - - def forward(self, images, orig_target_sizes): - outputs = self.model(images) - outputs = self.postprocessor(outputs, orig_target_sizes) - return outputs - - model = Model() - process_dataset(model, args.dataset, args.output, thrh=0.5) - # file_path = args.input - # if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']: - # process_image(model, file_path) - # print("Image processing complete.") - # else: - # process_video(model, file_path) - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, required=True) - parser.add_argument('-r', '--resume', type=str, required=True) - parser.add_argument('-d', '--dataset', type=str, default='./data/fiftyone/validation/data') - parser.add_argument('-o', '--output', type=str, required=True, help="Path to save visualized results") - args = parser.parse_args() - main(args) diff --git a/tools/inference/trt_inf.py b/tools/inference/trt_inf.py deleted file mode 100644 index 96c4d329..00000000 --- a/tools/inference/trt_inf.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import time -import contextlib -import collections -from collections import OrderedDict - -import numpy as np -from PIL import Image, ImageDraw - -import torch -import torchvision.transforms as T - -import tensorrt as trt -import cv2 # Added for video processing -import os - -class TimeProfiler(contextlib.ContextDecorator): - def __init__(self): - self.total = 0 - - def __enter__(self): - self.start = self.time() - return self - - def __exit__(self, type, value, traceback): - self.total += self.time() - self.start - - def reset(self): - self.total = 0 - - def time(self): - if torch.cuda.is_available(): - torch.cuda.synchronize() - return time.time() - -class TRTInference(object): - def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False): - self.engine_path = engine_path - self.device = device - self.backend = backend - self.max_batch_size = max_batch_size - - self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) - - self.engine = self.load_engine(engine_path) - self.context = self.engine.create_execution_context() - self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) - self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) - self.input_names = self.get_input_names() - self.output_names = self.get_output_names() - self.time_profile = TimeProfiler() - - def load_engine(self, path): - trt.init_libnvinfer_plugins(self.logger, '') - with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: - return runtime.deserialize_cuda_engine(f.read()) - - def get_input_names(self): - names = [] - for _, name in enumerate(self.engine): - if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: - names.append(name) - return names - - def get_output_names(self): - names = [] - for _, name in enumerate(self.engine): - if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: - names.append(name) - return names - - def get_bindings(self, engine, context, max_batch_size=32, device=None) -> OrderedDict: - Binding = collections.namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) - bindings = OrderedDict() - - for i, name in enumerate(engine): - shape = engine.get_tensor_shape(name) - dtype = trt.nptype(engine.get_tensor_dtype(name)) - - if shape[0] == -1: - shape[0] = max_batch_size - if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: - context.set_input_shape(name, shape) - - data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) - bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) - - return bindings - - def run_torch(self, blob): - for n in self.input_names: - if self.bindings[n].shape != blob[n].shape: - self.context.set_input_shape(n, blob[n].shape) - self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) - - assert self.bindings[n].data.dtype == blob[n].dtype, '{} dtype mismatch'.format(n) - - self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) - self.context.execute_v2(list(self.bindings_addr.values())) - outputs = {n: self.bindings[n].data for n in self.output_names} - - return outputs - - def __call__(self, blob): - if self.backend == 'torch': - return self.run_torch(blob) - else: - raise NotImplementedError("Only 'torch' backend is implemented.") - - def synchronize(self): - if self.backend == 'torch' and torch.cuda.is_available(): - torch.cuda.synchronize() - -def draw(images, labels, boxes, scores, thrh=0.4): - for i, im in enumerate(images): - 
draw = ImageDraw.Draw(im) - scr = scores[i] - lab = labels[i][scr > thrh] - box = boxes[i][scr > thrh] - scrs = scr[scr > thrh] - - for j, b in enumerate(box): - draw.rectangle(list(b), outline='red') - draw.text( - (b[0], b[1]), - text=f"{lab[j].item()} {round(scrs[j].item(), 2)}", - fill='blue', - ) - - return images - -def process_image(m, file_path, device): - im_pil = Image.open(file_path).convert('RGB') - w, h = im_pil.size - orig_size = torch.tensor([w, h])[None].to(device) - - transforms = T.Compose([ - T.Resize((640, 640)), - T.ToTensor(), - ]) - im_data = transforms(im_pil)[None] - - blob = { - 'images': im_data.to(device), - 'orig_target_sizes': orig_size.to(device), - } - - output = m(blob) - result_images = draw([im_pil], output['labels'], output['boxes'], output['scores']) - result_images[0].save('trt_result.jpg') - print("Image processing complete. Result saved as 'result.jpg'.") - -def process_video(m, file_path, device): - cap = cv2.VideoCapture(file_path) - - # Get video properties - fps = cap.get(cv2.CAP_PROP_FPS) - orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - - # Define the codec and create VideoWriter object - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - out = cv2.VideoWriter('trt_result.mp4', fourcc, fps, (orig_w, orig_h)) - - transforms = T.Compose([ - T.Resize((640, 640)), - T.ToTensor(), - ]) - - frame_count = 0 - print("Processing video frames...") - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - - # Convert frame to PIL image - frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - - w, h = frame_pil.size - orig_size = torch.tensor([w, h])[None].to(device) - - im_data = transforms(frame_pil)[None] - - blob = { - 'images': im_data.to(device), - 'orig_target_sizes': orig_size.to(device), - } - - output = m(blob) - - # Draw detections on the frame - result_images = draw([frame_pil], output['labels'], output['boxes'], output['scores']) - - # Convert back to OpenCV image - frame = cv2.cvtColor(np.array(result_images[0]), cv2.COLOR_RGB2BGR) - - # Write the frame - out.write(frame) - frame_count += 1 - - if frame_count % 10 == 0: - print(f"Processed {frame_count} frames...") - - cap.release() - out.release() - print("Video processing complete. 
Result saved as 'result_video.mp4'.") - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('-trt', '--trt', type=str, required=True) - parser.add_argument('-i', '--input', type=str, required=True) - parser.add_argument('-d', '--device', type=str, default='cuda:0') - - args = parser.parse_args() - - m = TRTInference(args.trt, device=args.device) - - file_path = args.input - if os.path.splitext(file_path)[-1].lower() in ['.jpg', '.jpeg', '.png', '.bmp']: - # Process as image - process_image(m, file_path, args.device) - else: - # Process as video - process_video(m, file_path, args.device) diff --git a/tools/reference/convert_weight.py b/tools/reference/convert_weight.py deleted file mode 100644 index 9651d19a..00000000 --- a/tools/reference/convert_weight.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import os -import argparse - -def save_only_ema_weights(checkpoint_file): - """Extract and save only the EMA weights.""" - checkpoint = torch.load(checkpoint_file, map_location='cpu') - - weights = {} - if 'ema' in checkpoint: - weights['model'] = checkpoint['ema']['module'] - else: - raise ValueError("The checkpoint does not contain 'ema'.") - - dir_name, base_name = os.path.split(checkpoint_file) - name, ext = os.path.splitext(base_name) - output_file = os.path.join(dir_name, f"{name}_converted{ext}") - - torch.save(weights, output_file) - print(f"EMA weights saved to {output_file}") - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Extract and save only EMA weights.") - parser.add_argument('checkpoint_dir', type=str, help="Path to the input checkpoint file.") - - args = parser.parse_args() - for file in os.listdir(args.checkpoint_dir): - if '.pth' in file and '_converted' not in file: - save_only_ema_weights(os.path.join(args.checkpoint_dir, file)) diff --git a/tools/reference/safe_training.sh b/tools/reference/safe_training.sh deleted file mode 100644 index d3c752a4..00000000 --- a/tools/reference/safe_training.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Function to display the menu for selecting model size -select_model_size() { - echo "Select model size:" - select size in s m l x; do - case $size in - s|m|l|x) - echo "You selected model size: $size" - MODEL_SIZE=$size - break - ;; - *) - echo "Invalid selection. Please try again." - ;; - esac - done -} - -# Function to display the menu for selecting task -select_task() { - echo "Select task:" - select task in obj365 obj2coco coco; do - case $task in - obj365|obj2coco|coco) - echo "You selected task: $task" - TASK=$task - break - ;; - *) - echo "Invalid selection. Please try again." - ;; - esac - done -} - -# Function to ask if the user wants to save logs to a txt file -ask_save_logs() { - while true; do - read -p "Do you want to save logs to a txt file? 
(y/n): " yn - case $yn in - [Yy]* ) - SAVE_LOGS=true - break - ;; - [Nn]* ) - SAVE_LOGS=false - break - ;; - * ) echo "Please answer yes or no.";; - esac - done -} - -# Call the functions to let the user select -select_model_size -select_task -ask_save_logs - -# Set config file and output directory based on selection -if [ "$TASK" = "coco" ]; then - CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" -else - CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml" -fi - -OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}" - -# Construct the training command -TRAIN_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR" - -# Append log redirection if SAVE_LOGS is true -if [ "$SAVE_LOGS" = true ]; then - LOG_FILE="${MODEL_SIZE}_${TASK}.txt" - TRAIN_CMD="$TRAIN_CMD &> \"$LOG_FILE\" 2>&1 &" -else - TRAIN_CMD="$TRAIN_CMD &" -fi - -# Run the training command -eval $TRAIN_CMD -if [ $? -ne 0 ]; then - echo "First training failed, restarting with resume option..." - while true; do - RESUME_CMD="CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=7777 --nproc_per_node=4 train.py -c $CONFIG_FILE --use-amp --seed=0 --output-dir $OUTPUT_DIR -r ${OUTPUT_DIR}/last.pth" - if [ "$SAVE_LOGS" = true ]; then - LOG_FILE="${MODEL_SIZE}_${TASK}_2.txt" - RESUME_CMD="$RESUME_CMD &> \"$LOG_FILE\" 2>&1 &" - else - RESUME_CMD="$RESUME_CMD &" - fi - eval $RESUME_CMD - if [ $? -eq 0 ]; then - break - fi - done -fi diff --git a/tools/visualization/fiftyone_vis.py b/tools/visualization/fiftyone_vis.py deleted file mode 100644 index 2d199d0b..00000000 --- a/tools/visualization/fiftyone_vis.py +++ /dev/null @@ -1,306 +0,0 @@ -""" -Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. 
-""" - -import os -import subprocess - -import argparse - -import torch -import fiftyone.core.models as fom -import fiftyone as fo -import fiftyone.zoo as foz -import torchvision.transforms as transforms -from PIL import Image -import fiftyone.core.labels as fol -import fiftyone.core.fields as fof -from fiftyone import ViewField as F -import time -import tqdm -import sys -sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')) -from engine.core import YAMLConfig - -def kill_existing_mongod(): - try: - result = subprocess.run(['ps', 'aux'], stdout=subprocess.PIPE) - processes = result.stdout.decode('utf-8').splitlines() - - for process in processes: - if 'mongod' in process and '--dbpath' in process: - # find mongod PID - pid = int(process.split()[1]) - print(f"Killing existing mongod process with PID: {pid}") - # kill mongod session - os.kill(pid, 9) - except Exception as e: - print(f"Error occurred while killing mongod: {e}") - -kill_existing_mongod() - - -label_map = { - 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorbike', 5: 'aeroplane', - 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'trafficlight', - 11: 'firehydrant', 12: 'streetsign', 13: 'stopsign', 14: 'parkingmeter', - 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', - 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', - 25: 'giraffe', 26: 'hat', 27: 'backpack', 28: 'umbrella', 29: 'shoe', - 30: 'eyeglasses', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', - 35: 'skis', 36: 'snowboard', 37: 'sportsball', 38: 'kite', 39: 'baseballbat', - 40: 'baseballglove', 41: 'skateboard', 42: 'surfboard', 43: 'tennisracket', - 44: 'bottle', 45: 'plate', 46: 'wineglass', 47: 'cup', 48: 'fork', - 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', - 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hotdog', - 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'sofa', - 64: 'pottedplant', 65: 'bed', 66: 'mirror', 67: 'diningtable', 68: 'window', - 69: 'desk', 70: 'toilet', 71: 'door', 72: 'tv', 73: 'laptop', - 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cellphone', 78: 'microwave', - 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 83: 'blender', - 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddybear', - 89: 'hairdrier', 90: 'toothbrush', 91: 'hairbrush' -} - -class CustomModel(fom.Model): - def __init__(self, cfg): - super().__init__() - self.model = cfg.model.eval().cuda() - self.postprocessor = cfg.postprocessor.eval().cuda() - self.transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Resize((640, 640)), # Resize to the size expected by your model - # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - - @property - def media_type(self): - return "image" - - @property - def has_logits(self): - return False - - @property - def has_embeddings(self): - return False - - @property - def ragged_batches(self): - return False - - @property - def transforms(self): - return None - - @property - def preprocess(self): - return True - - @preprocess.setter - def preprocess(self, value): - pass - - def _convert_predictions(self, predictions): - class_labels, bboxes, scores = predictions[0]['labels'], predictions[0]['boxes'], predictions[0]['scores'] - - detections = [] - for label, bbox, score in zip(class_labels, bboxes, scores): - detection = fol.Detection( - label=label_map[label.item()], - bounding_box=[ - bbox[0] / 640, # Normalized coordinates - bbox[1] / 640, - (bbox[2] - 
bbox[0]) / 640, - (bbox[3] - bbox[1]) / 640 - ], - confidence=score - ) - detections.append(detection) - - return fol.Detections(detections=detections) - - def predict(self, image): - image = Image.fromarray(image).convert('RGB') - image_tensor = self.transform(image).unsqueeze(0).cuda() - outputs = self.model(image_tensor) - orig_target_sizes = torch.tensor([[640, 640]]).cuda() - predictions = self.postprocessor(outputs, orig_target_sizes) - return self._convert_predictions(predictions) - - def predict_all(self, images): - image_tensors = [] - for image in images: - image = Image.fromarray(image) - image_tensor = self.transform(image) - image_tensors.append(image_tensor) - image_tensors = torch.stack(image_tensors).cuda() - outputs = self.model(image_tensors) - orig_target_sizes = torch.tensor([[640, 640] for image in images]).cuda() - predictions = self.postprocessor(outputs, orig_target_sizes) - converted_predictions = [self._convert_predictions(pred) for pred in predictions] - - # Ensure the output is a list of lists of Detections - return converted_predictions - -def filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3): - for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): - has_modified = False - for i, detection in enumerate(sample["predictions0"].detections): - - if "original_confidence" not in detection: - detection["original_confidence"] = detection["confidence"] - - if (detection["confidence"] <= confidence_threshold and sample["predictions5"].detections[i]["confidence"] >= confidence_threshold) or \ - (detection["confidence"] >= confidence_threshold and sample["predictions5"].detections[i]["confidence"] <= confidence_threshold): - - sample["predictions0"].detections[i]["confidence"] = sample["predictions5"].detections[i]["confidence"] - has_modified = True - if has_modified: - sample.save() - - -def restore_confidence(predictions_view): - for j, sample in tqdm.tqdm(enumerate(predictions_view), total=len(predictions_view)): - for i, detection in enumerate(sample["predictions0"].detections): - if "original_confidence" in detection: - detection["confidence"] = detection["original_confidence"] - sample.save() - -def fast_iou(bbox1, bbox2): - x1, y1, w1, h1 = bbox1 - x2, y2, w2, h2 = bbox2 - xA = max(x1, x2) - yA = max(y1, y2) - xB = min(x1 + w1, x2 + w2) - yB = min(y1 + h1, y2 + h2) - interArea = max(0, xB - xA) * max(0, yB - yA) - boxAArea = w1 * h1 - boxBArea = w2 * h2 - iou = interArea / float(boxAArea + boxBArea - interArea) - return iou - -def assign_iou_diff(predictions_view): - for sample in predictions_view: - ious_0 = [detection.eval0_iou if 'eval0_iou' in detection else None for detection in sample["predictions0"].detections] - ious_5 = [detection.eval5_iou if 'eval5_iou' in detection else None for detection in sample["predictions5"].detections] - bbox_0 = [detection.bounding_box for detection in sample["predictions0"].detections] - bbox_5 = [detection.bounding_box for detection in sample["predictions5"].detections] - # iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None else -1 for iou_0, iou_5 in zip(ious_0, ious_5)] - iou_inter = [fast_iou(b0, b5) for b0, b5 in zip(bbox_0, bbox_5)] - iou_diffs = [abs(iou_5 - iou_0) if iou_0 is not None and iou_5 is not None and iou_inter > 0.5 else -1 for iou_0, iou_5, iou_inter in zip(ious_0, ious_5, iou_inter)] - - for detection, iou_diff in zip(sample["predictions0"].detections, iou_diffs): - detection["iou_diff"] = iou_diff - for detection, iou_diff 
in zip(sample["predictions5"].detections, iou_diffs): - detection["iou_diff"] = iou_diff - # for detection, iou_diff in zip(sample["predictions100"].detections, iou_diffs): - # detection["iou_diff"] = iou_diff - sample.save() - -def main(args): - try: - if os.path.exists("saved_predictions_view") and os.path.exists("saved_filtered_view"): - print("Loading saved predictions and filtered views...") - dataset = foz.load_zoo_dataset( - "coco-2017", - split="validation", - dataset_name="evaluate-detections-tutorial", - dataset_dir="data/fiftyone" - ) - - dataset.persistent = True - session = fo.launch_app(dataset, port=args.port) - - predictions_view = fo.Dataset.from_dir( - dataset_dir="saved_predictions_view", - dataset_type=fo.types.FiftyOneDataset - ).view() - filtered_view = fo.Dataset.from_dir( - dataset_dir="saved_filtered_view", - dataset_type=fo.types.FiftyOneDataset - ).view() - else: - dataset = foz.load_zoo_dataset( - "coco-2017", - split="validation", - dataset_name="evaluate-detections-tutorial", - dataset_dir="data/fiftyone" - ) - - dataset.persistent = True - - session = fo.launch_app(dataset, port=args.port) - cfg = YAMLConfig(args.config, resume=args.resume) - if 'HGNetv2' in cfg.yaml_cfg: - cfg.yaml_cfg['HGNetv2']['pretrained'] = False - if args.resume: - checkpoint = torch.load(args.resume, map_location='cpu') - if 'ema' in checkpoint: - state = checkpoint['ema']['module'] - else: - state = checkpoint['model'] - else: - raise AttributeError('only support resume to load model.state_dict by now.') - - # NOTE load train mode state -> convert to deploy mode - cfg.model.load_state_dict(state) - predictions_view = dataset.take(500, seed=51) - - model = CustomModel(cfg) - L = model.model.decoder.decoder.eval_idx - # Apply models and save predictions in different label fields - for i in [L]: - model.model.decoder.decoder.eval_idx = i - label_field = "predictions{:d}".format(i) - predictions_view.apply_model(model, label_field=label_field) - - # filter_by_predictions5_confidence(predictions_view, confidence_threshold=0.3) - for i in [L]: - label_field = "predictions{:d}".format(i) - predictions_view = predictions_view.filter_labels(label_field, F("confidence") > 0.5, only_matches=False) - eval_key = "eval{:d}".format(i) - _ = predictions_view.evaluate_detections( - label_field, - gt_field="ground_truth", - eval_key=eval_key, - compute_mAP=True, - ) - - # assign_iou_diff(predictions_view) - - # filtered_view = predictions_view.filter_labels("predictions0", F("iou_diff") > 0.05, only_matches=True) - # filtered_view = filtered_view.filter_labels("predictions5", F("iou_diff") > 0.05, only_matches=True) - # restore_confidence(filtered_view) - - predictions_view.export( - export_dir="saved_predictions_view", - dataset_type=fo.types.FiftyOneDataset - ) - # filtered_view.export( - # export_dir="saved_filtered_view", - # dataset_type=fo.types.FiftyOneDataset - # ) - - # Display the filtered view - session.view = predictions_view - - # Keep the session open - while True: - time.sleep(1) - except Exception as e: - print(f"An error occurred: {e}") - finally: - print("Shutting down session") - if 'session' in locals(): - session.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--config', '-c', type=str) - parser.add_argument('--resume', '-r', type=str) - parser.add_argument('--port', '-p', type=int) - args = parser.parse_args() - - main(args) diff --git a/train.py b/train.py deleted file mode 100644 index 35e46eb1..00000000 --- a/train.py +++ 
/dev/null @@ -1,84 +0,0 @@
-"""
-DEIM: DETR with Improved Matching for Fast Convergence
-Copyright (c) 2024 The DEIM Authors. All Rights Reserved.
----------------------------------------------------------------------------------
-Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
-Copyright (c) 2023 lyuwenyu. All Rights Reserved.
-"""
-
-import os
-import sys
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
-
-import argparse
-
-from engine.misc import dist_utils
-from engine.core import YAMLConfig, yaml_utils
-from engine.solver import TASKS
-
-debug = False
-
-if debug:
-    import torch
-    def custom_repr(self):
-        return f'{{Tensor:{tuple(self.shape)}}} {original_repr(self)}'
-    original_repr = torch.Tensor.__repr__
-    torch.Tensor.__repr__ = custom_repr
-
-def main(args, ) -> None:
-    """main
-    """
-    dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed)
-
-    assert not all([args.tuning, args.resume]), \
-        'Only support from scratch, resume, or tuning at one time'
-
-    update_dict = yaml_utils.parse_cli(args.update)
-    update_dict.update({k: v for k, v in args.__dict__.items() \
-        if k not in ['update', ] and v is not None})
-
-    cfg = YAMLConfig(args.config, **update_dict)
-
-    if args.resume or args.tuning:
-        if 'HGNetv2' in cfg.yaml_cfg:
-            cfg.yaml_cfg['HGNetv2']['pretrained'] = False
-
-    print('cfg: ', cfg.__dict__)
-
-    solver = TASKS[cfg.yaml_cfg['task']](cfg)
-
-    if args.test_only:
-        solver.val()
-    else:
-        solver.fit()
-
-    dist_utils.cleanup()
-
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-
-    # priority 0
-    parser.add_argument('-c', '--config', type=str, required=True)
-    parser.add_argument('-r', '--resume', type=str, help='resume from checkpoint')
-    parser.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint')
-    parser.add_argument('-d', '--device', type=str, help='device',)
-    parser.add_argument('--seed', type=int, help='exp reproducibility')
-    parser.add_argument('--use-amp', action='store_true', help='auto mixed precision training')
-    parser.add_argument('--output-dir', type=str, help='output directory')
-    parser.add_argument('--summary-dir', type=str, help='tensorboard summary')
-    parser.add_argument('--test-only', action='store_true', default=False,)
-
-    # priority 1
-    parser.add_argument('-u', '--update', nargs='+', help='update yaml config')
-
-    # env
-    parser.add_argument('--print-method', type=str, default='builtin', help='print method')
-    parser.add_argument('--print-rank', type=int, default=0, help='print rank id')
-
-    parser.add_argument('--local-rank', type=int, help='local rank id')
-    args = parser.parse_args()
-
-    main(args)