diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3e006a2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+data
+work_dirs
+.vscode
+__pycache__/
+*.py[cod]
+*$py.class
+*.ipynb
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5659b03
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,89 @@
+FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
+
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
+ && apt-get update \
    && apt-get install -y ffmpeg git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6
+
+# Install OpenMMLab projects
+RUN pip install --no-deps \
+ mmengine==0.7.3 \
+ mmdet==3.0.0 \
+ mmsegmentation==1.0.0 \
+ git+https://github.com/open-mmlab/mmdetection3d.git@22aaa47fdb53ce1870ff92cb7e3f96ae38d17f61
+RUN pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.13.0/index.html --no-deps
+
+# Install MinkowskiEngine
+# Feel free to skip nvidia-cuda-dev if the MinkowskiEngine installation works without it
+RUN apt-get update \
+ && apt-get -y install libopenblas-dev nvidia-cuda-dev
+RUN TORCH_CUDA_ARCH_LIST="6.1 7.0 8.6" \
+ pip install git+https://github.com/NVIDIA/MinkowskiEngine.git@02fc608bea4c0549b0a7b00ca1bf15dee4a0b228 -v --no-deps \
+ --install-option="--blas=openblas" \
+ --install-option="--force_cuda"
+
+# Install torch-scatter
+RUN pip install torch-scatter==2.1.2 -f https://data.pyg.org/whl/torch-1.13.0+cu116.html --no-deps
+
+# Install ScanNet superpoint segmentator
+RUN git clone https://github.com/Karbo123/segmentator.git \
+ && cd segmentator/csrc \
+ && git reset --hard 76efe46d03dd27afa78df972b17d07f2c6cfb696 \
+ && mkdir build \
+ && cd build \
+ && cmake .. \
+ -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
+ -DPYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") \
+ -DPYTHON_LIBRARY=$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \
+ -DCMAKE_INSTALL_PREFIX=`python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())'` \
+ && make \
+ && make install \
+ && cd ../../..
+
+# Install remaining python packages
+RUN pip install --no-deps \
+ spconv-cu116==2.3.6 \
+ addict==2.4.0 \
+ yapf==0.33.0 \
+ termcolor==2.3.0 \
+ packaging==23.1 \
+ numpy==1.24.1 \
+ rich==13.3.5 \
+ opencv-python==4.7.0.72 \
+ pycocotools==2.0.6 \
+ Shapely==1.8.5 \
+ scipy==1.10.1 \
+ terminaltables==3.1.10 \
+ numba==0.57.0 \
+ llvmlite==0.40.0 \
+ pccm==0.4.7 \
+ ccimport==0.4.2 \
+ pybind11==2.10.4 \
+ ninja==1.11.1 \
+ lark==1.1.5 \
+ cumm-cu116==0.4.9 \
+ pyquaternion==0.9.9 \
+ lyft-dataset-sdk==0.0.8 \
+ pandas==2.0.1 \
+ python-dateutil==2.8.2 \
+ matplotlib==3.5.2 \
+ pyparsing==3.0.9 \
+ cycler==0.11.0 \
+ kiwisolver==1.4.4 \
+ scikit-learn==1.2.2 \
+ joblib==1.2.0 \
+ threadpoolctl==3.1.0 \
+ cachetools==5.3.0 \
+ nuscenes-devkit==1.1.10 \
+ trimesh==3.21.6 \
+ open3d==0.17.0 \
+ plotly==5.18.0 \
+ dash==2.14.2 \
+ plyfile==1.0.2 \
+ flask==3.0.0 \
+ werkzeug==3.0.1 \
+ click==8.1.7 \
+ blinker==1.7.0 \
+ itsdangerous==2.1.2 \
+ importlib_metadata==2.1.2 \
+ zipp==3.17.0
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e4cf43e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,159 @@
+# Attribution-NonCommercial 4.0 International
+
+> *Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.*
+>
+> ### Using Creative Commons Public Licenses
+>
+> Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
+>
+> * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
+>
+> * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
+
+## Creative Commons Attribution-NonCommercial 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
+
+### Section 1 – Definitions.
+
+a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
+
+b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
+
+c. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
+
+d. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
+
+e. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
+
+f. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
+
+g. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
+
+h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
+
+i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
+
+j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
+
+k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
+
+l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
+
+### Section 2 – Scope.
+
+a. ___License grant.___
+
+ 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
+
+ A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
+
+ B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
+
+ 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
+
+ 3. __Term.__ The term of this Public License is specified in Section 6(a).
+
+ 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
+
+ 5. __Downstream recipients.__
+
+ A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
+
+ B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
+
+ 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
+
+b. ___Other rights.___
+
+ 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this Public License.
+
+ 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
+
+### Section 3 – License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the following conditions.
+
+a. ___Attribution.___
+
+ 1. If You Share the Licensed Material (including in modified form), You must:
+
+ A. retain the following if it is supplied by the Licensor with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
+
+ B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
+
+ C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
+
+ 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
+
+ 4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
+
+### Section 4 – Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
+
+a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
+
+b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
+
+c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
+
+### Section 5 – Disclaimer of Warranties and Limitation of Liability.
+
+a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
+
+b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
+
+c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
+
+### Section 6 – Term and Termination.
+
+a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
+
+b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
+
+c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
+
+d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
+
+### Section 7 – Other Terms and Conditions.
+
+a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
+
+b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
+
+### Section 8 – Interpretation.
+
+a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
+
+b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
+
+c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
+
+d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
+
+> Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
+>
+> Creative Commons may be contacted at creativecommons.org
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..65846aa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,124 @@
+## OneFormer3D: One Transformer for Unified Point Cloud Segmentation
+
+**News**:
+ * :fire: February, 2024. OneFormer3D is accepted at CVPR 2024.
+ * :fire: November, 2023. OneFormer3D achieves state-of-the-art in
+ * 3D instance segmentation on ScanNet ([hidden test](https://kaldir.vc.in.tum.de/scannet_benchmark/semantic_instance_3d))
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/oneformer3d-one-transformer-for-unified-point/3d-instance-segmentation-on-scannetv2)](https://paperswithcode.com/sota/3d-instance-segmentation-on-scannetv2?p=oneformer3d-one-transformer-for-unified-point)
+ *(leaderboard screenshot)*
+
+
+> [Maksim Kolodiazhnyi](https://github.com/col14m),
+> [Anna Vorontsova](https://github.com/highrut),
+> [Anton Konushin](https://scholar.google.com/citations?user=ZT_k-wMAAAAJ),
+> [Danila Rukhovich](https://github.com/filaPro)
+>
+> Samsung Research
+> https://arxiv.org/abs/2311.14405
+
+### Installation
+
+For convenience, we provide a [Dockerfile](Dockerfile).
+This implementation is based on the [mmdetection3d](https://github.com/open-mmlab/mmdetection3d) framework `v1.1.0`. If installing without Docker, please follow their [get_started.md](https://github.com/open-mmlab/mmdetection3d/blob/22aaa47fdb53ce1870ff92cb7e3f96ae38d17f61/docs/en/get_started.md).
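+
+A typical workflow with the Dockerfile looks like this (a sketch; the image tag and mount paths are arbitrary):
+
+```shell
+# build the image from the repository root
+docker build -t oneformer3d .
+# start an interactive container with GPU access and the repository mounted
+docker run --gpus all -it --rm -v $(pwd):/workspace -w /workspace oneformer3d
+```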
+
+
+### Getting Started
+
+Please see [train_test.md](https://github.com/open-mmlab/mmdetection3d/blob/22aaa47fdb53ce1870ff92cb7e3f96ae38d17f61/docs/en/user_guides/train_test.md) for basic usage examples.
+For ScanNet and ScanNet200 dataset preprocessing, please follow our [instructions](data/scannet); they differ from the original mmdetection3d ones only by the addition of superpoint clustering. For S3DIS preprocessing we follow the original [instructions](https://github.com/open-mmlab/mmdetection3d/tree/22aaa47fdb53ce1870ff92cb7e3f96ae38d17f61/data/s3dis) from mmdetection3d. We also [support](data/structured3d) the Structured3D dataset for pre-training.
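+
+In short, training and testing follow the standard mmdetection3d pattern (a sketch; substitute any config from `configs/` and the matching checkpoint):
+
+```shell
+# train a model from a config
+python tools/train.py configs/oneformer3d_1xb4_scannet.py
+# evaluate a trained checkpoint
+python tools/test.py configs/oneformer3d_1xb4_scannet.py \
+    work_dirs/oneformer3d_1xb4_scannet/epoch_512.pth
+```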
+
+Important notes:
+ * The metrics from our paper can be achieved in several ways; in this repository we simply chose the most stable recipe for each dataset.
+ * If you are interested in only one of the three segmentation tasks, it is possible to achieve slightly better metrics than reported in our paper. Specifically, increasing `model.criterion.sem_criterion.loss_weight` in the config file leads to better semantic metrics, while decreasing it improves instance metrics (see the example after this list).
+ * All models can be trained on a single GPU with 32 GB of memory (or even 24 GB for the ScanNet dataset). If you run out of RAM during instance segmentation evaluation at the validation or test stage, feel free to decrease `model.test_cfg.topk_insts` in the config file.
+ * Due to a bug in SpConv, we [reshape](tools/fix_spconv_checkpoint.py) the backbone weights between the train and test stages.
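+
+Both tweaks above can be tried without editing the config, since mmdetection3d's entry points accept `--cfg-options` overrides (the values below are illustrative, not tuned):
+
+```shell
+# favor semantic metrics slightly during training
+python tools/train.py configs/oneformer3d_1xb4_scannet.py \
+    --cfg-options model.criterion.sem_criterion.loss_weight=0.4
+# keep fewer instances at evaluation time to save RAM
+python tools/test.py configs/oneformer3d_1xb4_scannet.py \
+    work_dirs/oneformer3d_1xb4_scannet/epoch_512.pth \
+    --cfg-options model.test_cfg.topk_insts=300
+```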
+
+#### ScanNet
+
+For ScanNet we present the model with an [SpConv](https://github.com/traveller59/spconv) backbone, superpoint pooling, selecting all queries, and predicting semantics directly from instance queries. The backbone is initialized from an [SSTNet](https://github.com/Gorilla-Lab-SCUT/SSTNet) checkpoint, which should be [downloaded](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/sstnet_scannet.pth) and placed in `work_dirs/tmp` before training.
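+
+For example (a sketch; any download tool works):
+
+```shell
+mkdir -p work_dirs/tmp
+wget -P work_dirs/tmp \
+    https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/sstnet_scannet.pth
+```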
+
+```shell
+# train (with validation)
+python tools/train.py configs/oneformer3d_1xb4_scannet.py
+# test
+python tools/fix_spconv_checkpoint.py \
+ --in-path work_dirs/oneformer3d_1xb4_scannet/epoch_512.pth \
+ --out-path work_dirs/oneformer3d_1xb4_scannet/epoch_512.pth
+python tools/test.py configs/oneformer3d_1xb4_scannet.py \
+ work_dirs/oneformer3d_1xb4_scannet/epoch_512.pth
+```
+
+#### ScanNet200
+
+For ScanNet200 we present the model with a [MinkowskiEngine](https://github.com/NVIDIA/MinkowskiEngine) backbone, superpoint pooling, selecting all queries, and predicting semantics directly from instance queries. The backbone is initialized from a [Mask3D](https://github.com/JonasSchult/Mask3D) checkpoint, which should be [downloaded](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/mask3d_scannet200.pth) and placed in `work_dirs/tmp` before training.
+
+```shell
+# train (with validation)
+python tools/train.py configs/oneformer3d_1xb4_scannet200.py
+# test
+python tools/test.py configs/oneformer3d_1xb4_scannet200.py \
+    work_dirs/oneformer3d_1xb4_scannet200/epoch_512.pth
+```
+
+#### S3DIS
+
+For S3DIS we present the model with an [SpConv](https://github.com/traveller59/spconv) backbone, w/o superpoint pooling, w/o query selection, and with separate semantic queries. The backbone is pre-trained on Structured3D and ScanNet; the checkpoint can be [downloaded](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/instance-only-oneformer3d_1xb2_scannet-and-structured3d.pth) and placed in `work_dirs/tmp` before training, or reproduced with our code. We train the model on Areas 1, 2, 3, 4, and 6, and test on Area 5. To change this split, modify the `train_area` and `test_area` parameters in the config.
+
+```shell
+# pre-train
+python tools/train.py configs/instance-only-oneformer3d_1xb2_scannet-and-structured3d.py
+python tools/fix_spconv_checkpoint.py \
+ --in-path work_dirs/instance-only-oneformer3d_1xb2_scannet-and-structured3d/iter_600000.pth \
+ --out-path work_dirs/tmp/instance-only-oneformer3d_1xb2_scannet-and-structured3d.pth
+# train (with validation)
+python tools/train.py configs/oneformer3d_1xb2_s3dis-area-5.py
+# test
+python tools/fix_spconv_checkpoint.py \
+ --in-path work_dirs/oneformer3d_1xb2_s3dis-area-5/epoch_512.pth \
+ --out-path work_dirs/oneformer3d_1xb2_s3dis-area-5/epoch_512.pth
+python tools/test.py configs/oneformer3d_1xb2_s3dis-area-5.py \
+ work_dirs/oneformer3d_1xb2_s3dis-area-5/epoch_512.pth
+```
+
+### Models
+
+Metric values in the table are given for the provided checkpoints and may differ slightly from the ones in our paper. Due to randomness, it may be necessary to run training with the same config several times to achieve the best metrics; a seeded run is shown after the table.
+
+| Dataset | mAP<sub>25</sub> | mAP<sub>50</sub> | mAP | mIoU | PQ | Download |
+|:-------:|:----------------:|:----------------:|:---:|:----:|:--:|:--------:|
+| ScanNet | 86.7 | 78.8 | 59.3 | 76.4 | 70.7 | [model](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb4_scannet.pth) \| [log](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb4_scannet.log) \| [config](configs/oneformer3d_1xb4_scannet.py) |
+| ScanNet200 | 44.6 | 40.9 | 30.2 | 29.4 | 29.7 | [model](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb4_scannet200.pth) \| [log](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb4_scannet200.log) \| [config](configs/oneformer3d_1xb4_scannet200.py) |
+| S3DIS | 80.6 | 72.7 | 58.0 | 71.9 | 64.6 | [model](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb2_s3dis-area-5.pth) \| [log](https://github.com/oneformer3d/oneformer3d/releases/download/v1.0/oneformer3d_1xb2_s3dis-area-5.log) \| [config](configs/oneformer3d_1xb2_s3dis-area-5.py) |
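+
+One way to make runs more repeatable is to fix the seed via mmengine's `randomness` option (a sketch; the seed value is arbitrary, and bitwise reproducibility on GPU is still not guaranteed):
+
+```shell
+python tools/train.py configs/oneformer3d_1xb4_scannet.py \
+    --cfg-options randomness.seed=42
+```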
+
+### Example Predictions
+
+*(example predictions figure)*
+
+### Citation
+
+If you find this work useful for your research, please cite our paper:
+
+```
+@misc{kolodiazhnyi2023oneformer3d,
+  url = {https://arxiv.org/abs/2311.14405},
+  author = {Kolodiazhnyi, Maxim and Vorontsova, Anna and Konushin, Anton and Rukhovich, Danila},
+  title = {OneFormer3D: One Transformer for Unified Point Cloud Segmentation},
+  publisher = {arXiv},
+  year = {2023}
+}
+```
diff --git a/configs/instance-only-oneformer3d_1xb2_scannet-and-structured3d.py b/configs/instance-only-oneformer3d_1xb2_scannet-and-structured3d.py
new file mode 100644
index 0000000..b8d22d3
--- /dev/null
+++ b/configs/instance-only-oneformer3d_1xb2_scannet-and-structured3d.py
@@ -0,0 +1,292 @@
+_base_ = ['mmdet3d::_base_/default_runtime.py']
+
+custom_imports = dict(imports=['oneformer3d'])
+
+# model settings
+num_classes_structured3d = 28
+num_classes_scannet = 18
+voxel_size = 0.05
+blocks = 5
+num_channels = 64
+
+model = dict(
+    type='InstanceOnlyOneFormer3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    in_channels=6,
+    num_channels=num_channels,
+    num_classes_1dataset=num_classes_structured3d,
+    num_classes_2dataset=num_classes_scannet,
+    prefix_1dataset='structured3d',
+    prefix_2dataset='scannet',
+    voxel_size=voxel_size,
+    min_spatial_shape=128,
+    backbone=dict(
+        type='SpConvUNet',
+        num_planes=[num_channels * (i + 1) for i in range(blocks)],
+        return_blocks=True),
+    decoder=dict(
+        type='OneDataQueryDecoder',
+        num_layers=3,
+        num_queries_1dataset=400,
+        num_queries_2dataset=400,
+        num_classes_1dataset=num_classes_structured3d,
+        num_classes_2dataset=num_classes_scannet,
+        prefix_1dataset='structured3d',
+        prefix_2dataset='scannet',
+        in_channels=num_channels,
+        d_model=256,
+        num_heads=8,
+        hidden_dim=1024,
+        dropout=0.0,
+        activation_fn='gelu',
+        iter_pred=True,
+        attn_mask=True,
+        fix_attention=True),
+    criterion=dict(
+        type='OneDataCriterion',
+        matcher=dict(
+            type='HungarianMatcher',
+            costs=[
+                dict(type='QueryClassificationCost', weight=0.5),
+                dict(type='MaskBCECost', weight=1.0),
+                dict(type='MaskDiceCost', weight=1.0)]),
+        loss_weight=[0.5, 1.0, 1.0, 0.5],
+        non_object_weight=0.05,
+        num_classes_1dataset=num_classes_structured3d,
+        num_classes_2dataset=num_classes_scannet,
+        fix_dice_loss_weight=True,
+        iter_matcher=True),
+    train_cfg=dict(),
+    test_cfg=dict(
+        topk_insts=400,
+        score_thr=0.0,
+        npoint_thr=100,
+        obj_normalization=True,
+        obj_normalization_thr=0.01,
+        sp_score_thr=0.15,
+        nms=True,
+        matrix_nms_kernel='linear'))
+
+# structured3d dataset settings
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+dataset_type_structured3d = 'Structured3DSegDataset'
+data_root_structured3d = 'data/structured3d/bins'
+
+class_names_structured3d = (
+    'wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
+    'window', 'picture', 'counter', 'desk', 'shelves', 'curtain', 'dresser',
+    'pillow', 'mirror', 'ceiling', 'fridge', 'television', 'night stand',
+    'toilet', 'sink', 'lamp', 'bathtub', 'structure', 'furniture', 'prop')
+metainfo_structured3d = dict(
+    classes=class_names_structured3d,
+    ignore_index=num_classes_structured3d)
+
+train_pipeline_structured3d = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(
+        type='PointSample_',
+        num_points=200000),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointInstClassMapping_',
+        num_classes=num_classes_structured3d,
+        structured3d=True),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.14, 0.14],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(type='NormalizePointsColor_',
+        color_mean=[127.5, 127.5, 127.5]),
+    dict(type='SkipEmptyScene'),
+    dict(
+        type='ElasticTransfrom',
+        gran=[6, 20],
+        mag=[40, 160],
+        voxel_size=voxel_size,
+        p=-1),
+    dict(
+        type='Pack3DDetInputs_',
+        keys=[
+            'points', 'pts_semantic_mask', 'pts_instance_mask',
+            'elastic_coords', 'gt_labels_3d'
+        ])
+]
+
+# scannet dataset settings
+dataset_type_scannet = 'ScanNetDataset'
+data_root_scannet = 'data/scannet/'
+
+class_names_scannet = (
+    'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
+    'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
+    'toilet', 'sink', 'bathtub', 'garbagebin')
+metainfo_scannet = dict(
+    classes=class_names_scannet,
+    ignore_index=num_classes_scannet)
+
+train_pipeline_scannet = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointInstClassMapping_',
+        num_classes=num_classes_scannet),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.14, 3.14],
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(type='NormalizePointsColor_',
+        color_mean=[127.5, 127.5, 127.5]),
+    dict(
+        type='ElasticTransfrom',
+        gran=[6, 20],
+        mag=[40, 160],
+        voxel_size=voxel_size),
+    dict(
+        type='Pack3DDetInputs_',
+        keys=[
+            'points', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask', 'elastic_coords'
+        ])
+]
+test_pipeline_scannet = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='NormalizePointsColor',
+                color_mean=[127.5, 127.5, 127.5]),
+        ]),
+    dict(type='Pack3DDetInputs_', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=6,
+    persistent_workers=True,
+    sampler=dict(type='InfiniteSampler', shuffle=True),
+    dataset=dict(
+        type='ConcatDataset_',
+        datasets=[
+            dict(
+                type=dataset_type_structured3d,
+                data_root=data_root_structured3d,
+                ann_file='structured3d_infos_train.pkl',
+                metainfo=metainfo_structured3d,
+                data_prefix=data_prefix,
+                pipeline=train_pipeline_structured3d,
+                ignore_index=num_classes_structured3d,
+                scene_idxs=None,
+                test_mode=False),
+            dict(
+                type='RepeatDataset',
+                times=10,
+                dataset=dict(
+                    type=dataset_type_scannet,
+                    data_root=data_root_scannet,
+                    ann_file='scannet_oneformer3d_infos_train.pkl',
+                    data_prefix=data_prefix,
+                    metainfo=metainfo_scannet,
+                    pipeline=train_pipeline_scannet,
+                    test_mode=False)),
+            dict(
+                type='RepeatDataset',
+                times=10,
+                dataset=dict(
+                    type=dataset_type_scannet,
+                    data_root=data_root_scannet,
+                    ann_file='scannet_oneformer3d_infos_val.pkl',
+                    data_prefix=data_prefix,
+                    metainfo=metainfo_scannet,
+                    pipeline=train_pipeline_scannet,
+                    test_mode=False))]))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type_scannet,
+        data_root=data_root_scannet,
+        ann_file='scannet_oneformer3d_infos_val.pkl',
+        metainfo=metainfo_scannet,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline_scannet,
+        test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='InstanceSegMetric_')
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.05),
+    clip_grad=dict(max_norm=10, norm_type=2))
+param_scheduler = dict(type='PolyLR', begin=0, end=600000,
+    power=0.9, by_epoch=False)
+log_processor = dict(by_epoch=False)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=25000))
+
+train_cfg = dict(
+    type='IterBasedTrainLoop',  # Use iter-based training loop
+    max_iters=600000,  # Maximum iterations
+    val_interval=25000)  # Validation interval
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/configs/oneformer3d_1xb2_s3dis-area-5.py b/configs/oneformer3d_1xb2_s3dis-area-5.py
new file mode 100644
index 0000000..7cef879
--- /dev/null
+++ b/configs/oneformer3d_1xb2_s3dis-area-5.py
@@ -0,0 +1,229 @@
+_base_ = [
+    'mmdet3d::_base_/default_runtime.py',
+]
+custom_imports = dict(imports=['oneformer3d'])
+
+# model settings
+num_channels = 64
+num_instance_classes = 13
+num_semantic_classes = 13
+
+model = dict(
+    type='S3DISOneFormer3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    in_channels=6,
+    num_channels=num_channels,
+    voxel_size=0.05,
+    num_classes=num_instance_classes,
+    min_spatial_shape=128,
+    backbone=dict(
+        type='SpConvUNet',
+        num_planes=[num_channels * (i + 1) for i in range(5)],
+        return_blocks=True),
+    decoder=dict(
+        type='QueryDecoder',
+        num_layers=3,
+        num_classes=num_instance_classes,
+        num_instance_queries=400,
+        num_semantic_queries=num_semantic_classes,
+        num_instance_classes=num_instance_classes,
+        in_channels=num_channels,
+        d_model=256,
+        num_heads=8,
+        hidden_dim=1024,
+        dropout=0.0,
+        activation_fn='gelu',
+        iter_pred=True,
+        attn_mask=True,
+        fix_attention=True,
+        objectness_flag=True),
+    criterion=dict(
+        type='S3DISUnifiedCriterion',
+        num_semantic_classes=num_semantic_classes,
+        sem_criterion=dict(
+            type='S3DISSemanticCriterion',
+            loss_weight=5.0),
+        inst_criterion=dict(
+            type='InstanceCriterion',
+            matcher=dict(
+                type='HungarianMatcher',
+                costs=[
+                    dict(type='QueryClassificationCost', weight=0.5),
+                    dict(type='MaskBCECost', weight=1.0),
+                    dict(type='MaskDiceCost', weight=1.0)]),
+            loss_weight=[0.5, 1.0, 1.0, 0.5],
+            num_classes=num_instance_classes,
+            non_object_weight=0.05,
+            fix_dice_loss_weight=True,
+            iter_matcher=True,
+            fix_mean_loss=True)),
+    train_cfg=dict(),
+    test_cfg=dict(
+        topk_insts=450,
+        inst_score_thr=0.0,
+        pan_score_thr=0.4,
+        npoint_thr=300,
+        obj_normalization=True,
+        obj_normalization_thr=0.01,
+        sp_score_thr=0.15,
+        nms=True,
+        matrix_nms_kernel='linear',
+        num_sem_cls=num_semantic_classes,
+        stuff_cls=[0, 1, 2, 3, 4, 5, 6, 12],
+        thing_cls=[7, 8, 9, 10, 11]))
+
+# dataset settings
+dataset_type = 'S3DISSegDataset_'
+data_root = 'data/s3dis/'
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_label_3d=False,
+        with_bbox_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(
+        type='PointSample_',
+        num_points=180000),
+    dict(type='PointInstClassMapping_',
+        num_classes=num_instance_classes),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0.0, 0.0],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(
+        type='NormalizePointsColor_',
+        color_mean=[127.5, 127.5, 127.5]),
+    dict(
+        type='Pack3DDetInputs_',
+        keys=[
+            'points', 'gt_labels_3d',
+            'pts_semantic_mask', 'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='NormalizePointsColor_',
+                color_mean=[127.5, 127.5, 127.5])]),
+    dict(type='Pack3DDetInputs_', keys=['points'])
+]
+
+# run settings
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='ConcatDataset',
+        datasets=([
+            dict(
+                type=dataset_type,
+                data_root=data_root,
+                ann_file=f's3dis_infos_Area_{i}.pkl',
+                pipeline=train_pipeline,
+                filter_empty_gt=True,
+                data_prefix=data_prefix,
+                box_type_3d='Depth',
+                backend_args=None) for i in train_area])))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='Depth',
+        backend_args=None))
+test_dataloader = val_dataloader
+
+class_names = [
+    'ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+    'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter', 'unlabeled']
+label2cat = {i: name for i, name in enumerate(class_names)}
+metric_meta = dict(
+    label2cat=label2cat,
+    ignore_index=[num_semantic_classes],
+    classes=class_names,
+    dataset_name='S3DIS')
+sem_mapping = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+val_evaluator = dict(
+    type='UnifiedSegMetric',
+    stuff_class_inds=[0, 1, 2, 3, 4, 5, 6, 12],
+    thing_class_inds=[7, 8, 9, 10, 11],
+    min_num_points=1,
+    id_offset=2**16,
+    sem_mapping=sem_mapping,
+    inst_mapping=sem_mapping,
+    submission_prefix_semantic=None,
+    submission_prefix_instance=None,
+    metric_meta=metric_meta)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.05),
+    clip_grad=dict(max_norm=10, norm_type=2))
+param_scheduler = dict(type='PolyLR', begin=0, end=512, power=0.9)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+default_hooks = dict(
+    checkpoint=dict(
+        interval=16,
+        max_keep_ckpts=1,
+        save_best=['all_ap_50%', 'miou'],
+        rule='greater'))
+
+load_from = 'work_dirs/tmp/instance-only-oneformer3d_1xb2_scannet-and-structured3d.pth'
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=512, val_interval=16)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/configs/oneformer3d_1xb4_scannet.py b/configs/oneformer3d_1xb4_scannet.py
new file mode 100644
index 0000000..e695b19
--- /dev/null
+++ b/configs/oneformer3d_1xb4_scannet.py
@@ -0,0 +1,234 @@
+_base_ = [
+    'mmdet3d::_base_/default_runtime.py',
+    'mmdet3d::_base_/datasets/scannet-seg.py'
+]
+custom_imports = dict(imports=['oneformer3d'])
+
+# model settings
+num_channels = 32
+num_instance_classes = 18
+num_semantic_classes = 20
+
+model = dict(
+    type='ScanNetOneFormer3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor_'),
+    in_channels=6,
+    num_channels=num_channels,
+    voxel_size=0.02,
+    num_classes=num_instance_classes,
+    min_spatial_shape=128,
+    query_thr=0.5,
+    backbone=dict(
+        type='SpConvUNet',
+        num_planes=[num_channels * (i + 1) for i in range(5)],
+        return_blocks=True),
+    decoder=dict(
+        type='ScanNetQueryDecoder',
+        num_layers=6,
+        num_instance_queries=0,
+        num_semantic_queries=0,
+        num_instance_classes=num_instance_classes,
+        num_semantic_classes=num_semantic_classes,
+        num_semantic_linears=1,
+        in_channels=32,
+        d_model=256,
+        num_heads=8,
+        hidden_dim=1024,
+        dropout=0.0,
+        activation_fn='gelu',
+        iter_pred=True,
+        attn_mask=True,
+        fix_attention=True,
+        objectness_flag=False),
+    criterion=dict(
+        type='ScanNetUnifiedCriterion',
+        num_semantic_classes=num_semantic_classes,
+        sem_criterion=dict(
+            type='ScanNetSemanticCriterion',
+            ignore_index=num_semantic_classes,
+            loss_weight=0.2),
+        inst_criterion=dict(
+            type='InstanceCriterion',
+            matcher=dict(
+                type='SparseMatcher',
+                costs=[
+                    dict(type='QueryClassificationCost', weight=0.5),
+                    dict(type='MaskBCECost', weight=1.0),
+                    dict(type='MaskDiceCost', weight=1.0)],
+                topk=1),
+            loss_weight=[0.5, 1.0, 1.0, 0.5],
+            num_classes=num_instance_classes,
+            non_object_weight=0.1,
+            fix_dice_loss_weight=True,
+            iter_matcher=True,
+            fix_mean_loss=True)),
+    train_cfg=dict(),
+    test_cfg=dict(
+        topk_insts=600,
+        inst_score_thr=0.0,
+        pan_score_thr=0.5,
+        npoint_thr=100,
+        obj_normalization=True,
+        sp_score_thr=0.4,
+        nms=True,
+        matrix_nms_kernel='linear',
+        stuff_classes=[0, 1]))
+
+# dataset settings
+dataset_type = 'ScanNetSegDataset_'
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask',
+    sp_pts_mask='super_points')
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D_',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        with_sp_mask_3d=True),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.14, 3.14],
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(
+        type='NormalizePointsColor_',
+        color_mean=[127.5, 127.5, 127.5]),
+    dict(
+        type='AddSuperPointAnnotations',
+        num_classes=num_semantic_classes,
+        stuff_classes=[0, 1],
+        merge_non_stuff_cls=False),
+    dict(
+        type='ElasticTransfrom',
+        gran=[6, 20],
+        mag=[40, 160],
+        voxel_size=0.02,
+        p=0.5),
+    dict(
+        type='Pack3DDetInputs_',
+        keys=[
+            'points', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask',
+            'sp_pts_mask', 'gt_sp_masks', 'elastic_coords'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D_',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        with_sp_mask_3d=True),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='NormalizePointsColor_',
+                color_mean=[127.5, 127.5, 127.5]),
+            dict(
+                type='AddSuperPointAnnotations',
+                num_classes=num_semantic_classes,
+                stuff_classes=[0, 1],
+                merge_non_stuff_cls=False),
+        ]),
+    dict(type='Pack3DDetInputs_', keys=['points', 'sp_pts_mask'])
+]
+
+# run settings
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=6,
+    dataset=dict(
+        type=dataset_type,
+        ann_file='scannet_oneformer3d_infos_train.pkl',
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        ignore_index=num_semantic_classes,
+        scene_idxs=None,
+        test_mode=False))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='scannet_oneformer3d_infos_val.pkl',
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        ignore_index=num_semantic_classes,
+        test_mode=True))
+test_dataloader = val_dataloader
+
+class_names = [
+    'wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+    'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+    'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+    'bathtub', 'otherfurniture']
+class_names += ['unlabeled']
+label2cat = {i: name for i, name in enumerate(class_names)}
+metric_meta = dict(
+    label2cat=label2cat,
+    ignore_index=[num_semantic_classes],
+    classes=class_names,
+    dataset_name='ScanNet')
+
+sem_mapping = [
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]
+inst_mapping = sem_mapping[2:]
+val_evaluator = dict(
+    type='UnifiedSegMetric',
+    stuff_class_inds=[0, 1],
+    thing_class_inds=list(range(2, num_semantic_classes)),
+    min_num_points=1,
+    id_offset=2**16,
+    sem_mapping=sem_mapping,
+    inst_mapping=inst_mapping,
+    metric_meta=metric_meta)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.05),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+param_scheduler = dict(type='PolyLR', begin=0, end=512, power=0.9)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=16))
+
+load_from = 'work_dirs/tmp/sstnet_scannet.pth'
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop',
+    max_epochs=512,
+    dynamic_intervals=[(1, 16), (512 - 16, 1)])
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/configs/oneformer3d_1xb4_scannet200.py b/configs/oneformer3d_1xb4_scannet200.py
new file mode 100644
index 0000000..36c3fb1
--- /dev/null
+++ b/configs/oneformer3d_1xb4_scannet200.py
@@ -0,0 +1,302 @@
+_base_ = [
+    'mmdet3d::_base_/default_runtime.py',
+    'mmdet3d::_base_/datasets/scannet-seg.py'
+]
+custom_imports = dict(imports=['oneformer3d'])
+
+# model settings
+num_instance_classes = 198
+num_semantic_classes = 200
+
+model = dict(
+    type='ScanNet200OneFormer3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor_'),
+    voxel_size=0.02,
+    num_classes=num_instance_classes,
+    query_thr=0.5,
+    backbone=dict(
+        type='Res16UNet34C',
+        in_channels=3,
+        out_channels=96,
+        config=dict(
+            dilations=[1, 1, 1, 1],
+            conv1_kernel_size=5,
+            bn_momentum=0.02)),
+    decoder=dict(
+        type='ScanNetQueryDecoder',
+        num_layers=6,
+        num_instance_queries=0,
+        num_semantic_queries=0,
+        num_instance_classes=num_instance_classes,
+        num_semantic_classes=num_semantic_classes,
+        num_semantic_linears=1,
+        in_channels=96,
+        d_model=256,
+        num_heads=8,
+        hidden_dim=1024,
+        dropout=0.0,
+        activation_fn='gelu',
+        iter_pred=True,
+        attn_mask=True,
+        fix_attention=True,
+        objectness_flag=False),
+    criterion=dict(
+        type='ScanNetUnifiedCriterion',
+        num_semantic_classes=num_semantic_classes,
+        sem_criterion=dict(
+            type='ScanNetSemanticCriterion',
+            ignore_index=num_semantic_classes,
+            loss_weight=0.5),
+        inst_criterion=dict(
+            type='InstanceCriterion',
+            matcher=dict(
+                type='SparseMatcher',
+                costs=[
+                    dict(type='QueryClassificationCost', weight=0.5),
+                    dict(type='MaskBCECost', weight=1.0),
+                    dict(type='MaskDiceCost', weight=1.0)],
+                topk=1),
+            loss_weight=[0.5, 1.0, 1.0, 0.5],
+            num_classes=num_instance_classes,
+            non_object_weight=0.1,
+            fix_dice_loss_weight=True,
+            iter_matcher=True,
+            fix_mean_loss=True)),
+    train_cfg=dict(),
+    test_cfg=dict(
+        topk_insts=600,
+        inst_score_thr=0.0,
+        pan_score_thr=0.5,
+        npoint_thr=100,
+        obj_normalization=True,
+        sp_score_thr=0.4,
+        nms=True,
+        matrix_nms_kernel='linear',
+        stuff_classes=[0, 1]))
+
+# dataset settings
+dataset_type = 'ScanNet200SegDataset_'
+data_root = 'data/scannet200/'
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask',
+    sp_pts_mask='super_points')
+
+# floor and chair are changed
+class_names = [
+    'wall', 'floor', 'chair', 'table', 'door', 'couch', 'cabinet', 'shelf',
+    'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window',
+    'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair',
+    'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel',
+    'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion',
+    'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard',
+    'bag', 'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard',
+    'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave',
+    'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench',
+    'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair',
+    'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person',
+    'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard',
+    'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container',
+    'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand',
+    'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar',
+    'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', 'ladder',
+    'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin',
+    'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat',
+    'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board',
+    'fireplace', 'soap dish', 'kitchen counter', 'doorframe',
+    'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball',
+    'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray',
+    'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse',
+    'toilet seat cover dispenser', 'furniture', 'cart', 'storage container',
+    'scale', 'tissue box', 'light switch', 'crate', 'power outlet',
+    'decoration', 'sign', 'projector', 'closet door', 'vacuum cleaner',
+    'candle', 'plunger', 'stuffed animal', 'headphones', 'dish rack',
+    'broom', 'guitar case', 'range hood', 'dustpan', 'hair dryer',
+    'water bottle', 'handicap bar', 'purse', 'vent', 'shower floor',
+    'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock',
+    'music stand', 'projector screen', 'divider', 'laundry detergent',
+    'bathroom counter', 'object', 'bathroom vanity', 'closet wall',
+    'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin',
+    'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'cd case',
+    'closet rod', 'coffee kettle', 'structure', 'shower head',
+    'keyboard piano', 'case of water bottles', 'coat rack',
+    'storage organizer', 'folded chair', 'fire alarm', 'power strip',
+    'calendar', 'poster', 'potted plant', 'luggage', 'mattress'
+]
+
+color_mean = (
+    0.47793125906962 * 255,
+    0.4303257521323044 * 255,
+    0.3749598901421883 * 255)
+color_std = (
+    0.2834475483823543 * 255,
+    0.27566157565723015 * 255,
+    0.27018971370874995 * 255)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D_',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        with_sp_mask_3d=True),
+    dict(type='SwapChairAndFloor'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.14, 3.14],
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(
+        type='NormalizePointsColor_',
+        color_mean=color_mean,
+        color_std=color_std),
+    dict(
+        type='AddSuperPointAnnotations',
+        num_classes=num_semantic_classes,
+        stuff_classes=[0, 1],
+        merge_non_stuff_cls=False),
+    dict(
+        type='ElasticTransfrom',
+        gran=[6, 20],
+        mag=[40, 160],
+        voxel_size=0.02,
+        p=0.5),
+    dict(
+        type='Pack3DDetInputs_',
+        keys=[
+            'points', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask',
+            'sp_pts_mask', 'gt_sp_masks', 'elastic_coords'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D_',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        with_sp_mask_3d=True),
+    dict(type='SwapChairAndFloor'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='NormalizePointsColor_',
+                color_mean=color_mean,
+                color_std=color_std),
+            dict(
+                type='AddSuperPointAnnotations',
+                num_classes=num_semantic_classes,
+                stuff_classes=[0, 1],
+                merge_non_stuff_cls=False),
+        ]),
+    dict(type='Pack3DDetInputs_', keys=['points', 'sp_pts_mask'])
+]
+
+# run settings
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=6,
+    dataset=dict(
+        type=dataset_type,
+        ann_file='scannet200_oneformer3d_infos_train.pkl',
+        data_root=data_root,
+        data_prefix=data_prefix,
+        metainfo=dict(classes=class_names),
+        pipeline=train_pipeline,
+        ignore_index=num_semantic_classes,
+        scene_idxs=None,
+        test_mode=False))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='scannet200_oneformer3d_infos_val.pkl',
+        data_root=data_root,
+        data_prefix=data_prefix,
+        metainfo=dict(classes=class_names),
+        pipeline=test_pipeline,
+        ignore_index=num_semantic_classes,
+        test_mode=True))
+test_dataloader = val_dataloader + +label2cat = {i: name for i, name in enumerate(class_names + ['unlabeled'])} +metric_meta = dict( + label2cat=label2cat, + ignore_index=[num_semantic_classes], + classes=class_names + ['unlabeled'], + dataset_name='ScanNet200') + +sem_mapping = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, + 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, + 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, + 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, + 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, + 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 221, 229, 230, + 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, + 342, 356, 370, 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, 609, + 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, + 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, + 1186, 1187, 1188, 1189, 1190, 1191 +] +inst_mapping = sem_mapping[2:] + +val_evaluator = dict( + type='UnifiedSegMetric', + stuff_class_inds=[0, 1], + thing_class_inds=list(range(2, num_semantic_classes)), + min_num_points=1, + id_offset=2**16, + sem_mapping=sem_mapping, + inst_mapping=inst_mapping, + metric_meta=metric_meta) +test_evaluator = val_evaluator + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.05), + clip_grad=dict(max_norm=10, norm_type=2)) +param_scheduler = dict(type='PolyLR', begin=0, end=512, power=0.9) + +custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] +default_hooks = dict( + checkpoint=dict( + interval=1, + max_keep_ckpts=1, + save_best=['all_ap_50%', 'miou'], + rule='greater')) + +load_from = 'work_dirs/tmp/mask3d_scannet200.pth' + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=512, val_interval=16) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/data/scannet/README.md b/data/scannet/README.md new file mode 100644 index 0000000..35f3267 --- /dev/null +++ b/data/scannet/README.md @@ -0,0 +1,67 @@ +### Prepare ScanNet Data for Indoor Detection or Segmentation Task + +We follow the procedure in [votenet](https://github.com/facebookresearch/votenet/). + +1. Download ScanNet v2 data [HERE](https://github.com/ScanNet/ScanNet). Link or move the 'scans' folder to this level of directory. If you are performing segmentation tasks and want to upload the results to its official [benchmark](http://kaldir.vc.in.tum.de/scannet_benchmark/), please also link or move the 'scans_test' folder to this directory. + +2. In this directory, extract point clouds and annotations by running `python batch_load_scannet_data.py`. Add the `--scannet200` flag if you want to get markup for the ScanNet200 dataset. + +3. 
Enter the project root directory and generate the training data by running
+
+```bash
+python tools/create_data.py scannet --root-path ./data/scannet --out-dir ./data/scannet --extra-tag scannet
+```
+
+or for ScanNet200:
+
+```bash
+mkdir data/scannet200
+python tools/create_data.py scannet200 --root-path ./data/scannet --out-dir ./data/scannet200 --extra-tag scannet200
+```
+
+The overall process for ScanNet can be accomplished with the following script:
+
+```bash
+python batch_load_scannet_data.py
+cd ../..
+python tools/create_data.py scannet --root-path ./data/scannet --out-dir ./data/scannet --extra-tag scannet
+```
+
+Or for ScanNet200:
+
+```bash
+python batch_load_scannet_data.py --scannet200
+cd ../..
+mkdir data/scannet200
+python tools/create_data.py scannet200 --root-path ./data/scannet --out-dir ./data/scannet200 --extra-tag scannet200
+```
+
+The directory structure after pre-processing should be as follows:
+
+```
+scannet
+├── meta_data
+├── batch_load_scannet_data.py
+├── load_scannet_data.py
+├── scannet_utils.py
+├── README.md
+├── scans
+├── scans_test
+├── scannet_instance_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── super_points
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── train_label_weight.npy
+│   ├── train_resampled_scene_idxs.npy
+│   ├── val_label_weight.npy
+│   ├── val_resampled_scene_idxs.npy
+├── scannet_oneformer3d_infos_train.pkl
+├── scannet_oneformer3d_infos_val.pkl
+├── scannet_oneformer3d_infos_test.pkl
+```
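+
+The snippet below is a minimal sanity check for the generated files. It is a sketch, not part of the pipeline: the scene id is hypothetical, and it assumes the usual mmdetection3d layout of float32 points (6 values per point: x, y, z, r, g, b) and int64 masks in the `.bin` files; adjust the dtypes if your version differs. Run it from this directory.
+
+```python
+import numpy as np
+
+scene = 'scene0000_00'  # hypothetical scene id
+points = np.fromfile(f'points/{scene}.bin', dtype=np.float32).reshape(-1, 6)
+sem_mask = np.fromfile(f'semantic_mask/{scene}.bin', dtype=np.int64)
+ins_mask = np.fromfile(f'instance_mask/{scene}.bin', dtype=np.int64)
+sp_mask = np.fromfile(f'super_points/{scene}.bin', dtype=np.int64)
+# Every per-point array should have exactly one entry per point.
+assert len(points) == len(sem_mask) == len(ins_mask) == len(sp_mask)
+print(points.shape, np.unique(ins_mask).size, np.unique(sp_mask).size)
+```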
diff --git a/data/scannet/batch_load_scannet_data.py b/data/scannet/batch_load_scannet_data.py
new file mode 100644
index 0000000..7a8f514
--- /dev/null
+++ b/data/scannet/batch_load_scannet_data.py
@@ -0,0 +1,187 @@
+# Modified from
+# https://github.com/facebookresearch/votenet/blob/master/scannet/batch_load_scannet_data.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Batch-load ScanNet scenes with vertices and ground-truth labels for
+semantic and instance segmentation.
+
+Usage example: python ./batch_load_scannet_data.py
+"""
+import argparse
+import datetime
+import os
+from os import path as osp
+
+import torch
+import segmentator
+import open3d as o3d
+import numpy as np
+from load_scannet_data import export
+
+DONOTCARE_CLASS_IDS = np.array([])
+
+# Semantic ids of the object classes kept when filtering instance boxes:
+# nyu40 ids for ScanNet, raw label ids for ScanNet200.
+SCANNET_OBJ_CLASS_IDS = np.array(
+    [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39])
+
+SCANNET200_OBJ_CLASS_IDS = np.array([2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+                                     72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 145, 148, 154,
+                                     155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, 392, 395, 399, 408, 417,
+                                     488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191])
+
+
+def export_one_scan(scan_name,
+                    output_filename_prefix,
+                    max_num_point,
+                    label_map_file,
+                    scannet_dir,
+                    test_mode=False,
+                    scannet200=False):
+    mesh_file = osp.join(scannet_dir, scan_name, scan_name + '_vh_clean_2.ply')
+    agg_file = osp.join(scannet_dir, scan_name,
+                        scan_name + '.aggregation.json')
+    seg_file = osp.join(scannet_dir, scan_name,
+                        scan_name + '_vh_clean_2.0.010000.segs.json')
+    # includes axisAlignment info for the train set scans.
+ meta_file = osp.join(scannet_dir, scan_name, f'{scan_name}.txt') + mesh_vertices, semantic_labels, instance_labels, unaligned_bboxes, \ + aligned_bboxes, instance2semantic, axis_align_matrix = export( + mesh_file, agg_file, seg_file, meta_file, label_map_file, None, + test_mode, scannet200) + + if not test_mode: + mask = np.logical_not(np.in1d(semantic_labels, DONOTCARE_CLASS_IDS)) + mesh_vertices = mesh_vertices[mask, :] + semantic_labels = semantic_labels[mask] + instance_labels = instance_labels[mask] + + num_instances = len(np.unique(instance_labels)) + print(f'Num of instances: {num_instances}') + if scannet200: + OBJ_CLASS_IDS = SCANNET200_OBJ_CLASS_IDS + else: + OBJ_CLASS_IDS = SCANNET_OBJ_CLASS_IDS + + bbox_mask = np.in1d(unaligned_bboxes[:, -1], OBJ_CLASS_IDS) + unaligned_bboxes = unaligned_bboxes[bbox_mask, :] + bbox_mask = np.in1d(aligned_bboxes[:, -1], OBJ_CLASS_IDS) + aligned_bboxes = aligned_bboxes[bbox_mask, :] + assert unaligned_bboxes.shape[0] == aligned_bboxes.shape[0] + print(f'Num of care instances: {unaligned_bboxes.shape[0]}') + + if max_num_point is not None: + max_num_point = int(max_num_point) + N = mesh_vertices.shape[0] + if N > max_num_point: + choices = np.random.choice(N, max_num_point, replace=False) + mesh_vertices = mesh_vertices[choices, :] + if not test_mode: + semantic_labels = semantic_labels[choices] + instance_labels = instance_labels[choices] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + vertices = torch.from_numpy(np.array(mesh.vertices).astype(np.float32)) + faces = torch.from_numpy(np.array(mesh.triangles).astype(np.int64)) + superpoints = segmentator.segment_mesh(vertices, faces).numpy() + + np.save(f'{output_filename_prefix}_sp_label.npy', superpoints) + np.save(f'{output_filename_prefix}_vert.npy', mesh_vertices) + + if not test_mode: + assert superpoints.shape == semantic_labels.shape + np.save(f'{output_filename_prefix}_sem_label.npy', semantic_labels) + np.save(f'{output_filename_prefix}_ins_label.npy', instance_labels) + np.save(f'{output_filename_prefix}_unaligned_bbox.npy', + unaligned_bboxes) + np.save(f'{output_filename_prefix}_aligned_bbox.npy', aligned_bboxes) + np.save(f'{output_filename_prefix}_axis_align_matrix.npy', + axis_align_matrix) + + +def batch_export(max_num_point, + output_folder, + scan_names_file, + label_map_file, + scannet_dir, + test_mode=False, + scannet200=False): + if test_mode and not os.path.exists(scannet_dir): + # test data preparation is optional + return + if not os.path.exists(output_folder): + print(f'Creating new data folder: {output_folder}') + os.mkdir(output_folder) + + scan_names = [line.rstrip() for line in open(scan_names_file)] + for scan_name in scan_names: + print('-' * 20 + 'begin') + print(datetime.datetime.now()) + print(scan_name) + output_filename_prefix = osp.join(output_folder, scan_name) + if osp.isfile(f'{output_filename_prefix}_vert.npy'): + print('File already exists. 
skipping.')
+            print('-' * 20 + 'done')
+            continue
+        try:
+            export_one_scan(scan_name, output_filename_prefix, max_num_point,
+                            label_map_file, scannet_dir, test_mode, scannet200)
+        except Exception:
+            print(f'Failed to export scan: {scan_name}')
+        print('-' * 20 + 'done')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--max_num_point',
+        default=None,
+        help='The maximum number of points to keep per scene.')
+    parser.add_argument(
+        '--output_folder',
+        default='./scannet_instance_data',
+        help='Output folder for the results.')
+    parser.add_argument(
+        '--train_scannet_dir',
+        default='scans',
+        help='ScanNet train scans directory.')
+    parser.add_argument(
+        '--test_scannet_dir',
+        default='scans_test',
+        help='ScanNet test scans directory.')
+    parser.add_argument(
+        '--label_map_file',
+        default='meta_data/scannetv2-labels.combined.tsv',
+        help='The path of the label map file.')
+    parser.add_argument(
+        '--train_scan_names_file',
+        default='meta_data/scannet_train.txt',
+        help='The path of the file that stores the train scan names.')
+    parser.add_argument(
+        '--test_scan_names_file',
+        default='meta_data/scannetv2_test.txt',
+        help='The path of the file that stores the test scan names.')
+    parser.add_argument(
+        '--scannet200',
+        action='store_true',
+        help='Use the ScanNet200 label mapping.')
+    args = parser.parse_args()
+    batch_export(
+        args.max_num_point,
+        args.output_folder,
+        args.train_scan_names_file,
+        args.label_map_file,
+        args.train_scannet_dir,
+        test_mode=False,
+        scannet200=args.scannet200)
+    batch_export(
+        args.max_num_point,
+        args.output_folder,
+        args.test_scan_names_file,
+        args.label_map_file,
+        args.test_scannet_dir,
+        test_mode=True,
+        scannet200=args.scannet200)
+
+
+if __name__ == '__main__':
+    main()
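+
+# Example (run from data/scannet/):
+#   python batch_load_scannet_data.py               # ScanNet
+#   python batch_load_scannet_data.py --scannet200  # ScanNet200
+# For every scene this writes <scene>_vert.npy and <scene>_sp_label.npy, plus
+# <scene>_sem_label.npy, <scene>_ins_label.npy, <scene>_unaligned_bbox.npy,
+# <scene>_aligned_bbox.npy and <scene>_axis_align_matrix.npy for train scans,
+# into the --output_folder directory.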
diff --git a/data/scannet/load_scannet_data.py b/data/scannet/load_scannet_data.py
new file mode 100644
index 0000000..7cbe499
--- /dev/null
+++ b/data/scannet/load_scannet_data.py
@@ -0,0 +1,205 @@
+# Modified from
+# https://github.com/facebookresearch/votenet/blob/master/scannet/load_scannet_data.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Load ScanNet scenes with vertices and ground-truth labels for semantic and
+instance segmentation."""
+import argparse
+import inspect
+import json
+import os
+
+import numpy as np
+import scannet_utils
+
+currentdir = os.path.dirname(
+    os.path.abspath(inspect.getfile(inspect.currentframe())))
+
+
+def read_aggregation(filename):
+    assert os.path.isfile(filename)
+    object_id_to_segs = {}
+    label_to_segs = {}
+    with open(filename) as f:
+        data = json.load(f)
+        num_objects = len(data['segGroups'])
+        for i in range(num_objects):
+            object_id = data['segGroups'][i][
+                'objectId'] + 1  # instance ids should be 1-indexed
+            label = data['segGroups'][i]['label']
+            segs = data['segGroups'][i]['segments']
+            object_id_to_segs[object_id] = segs
+            if label in label_to_segs:
+                label_to_segs[label].extend(segs)
+            else:
+                label_to_segs[label] = segs
+    return object_id_to_segs, label_to_segs
+
+
+def read_segmentation(filename):
+    assert os.path.isfile(filename)
+    seg_to_verts = {}
+    with open(filename) as f:
+        data = json.load(f)
+        num_verts = len(data['segIndices'])
+        for i in range(num_verts):
+            seg_id = data['segIndices'][i]
+            if seg_id in seg_to_verts:
+                seg_to_verts[seg_id].append(i)
+            else:
+                seg_to_verts[seg_id] = [i]
+    return seg_to_verts, num_verts
+
+
+def extract_bbox(mesh_vertices, object_id_to_segs, object_id_to_label_id,
+                 instance_ids):
+    num_instances = len(np.unique(list(object_id_to_segs.keys())))
+    instance_bboxes = np.zeros((num_instances, 7))
+    for obj_id in object_id_to_segs:
+        label_id = object_id_to_label_id[obj_id]
+        obj_pc = mesh_vertices[instance_ids == obj_id, 0:3]
+        if len(obj_pc) == 0:
+            continue
+        xyz_min = np.min(obj_pc, axis=0)
+        xyz_max = np.max(obj_pc, axis=0)
+        bbox = np.concatenate([(xyz_min + xyz_max) / 2.0, xyz_max - xyz_min,
+                               np.array([label_id])])
+        # NOTE: this assumes obj_id is in 1, 2, ..., NUM_INSTANCES
+        instance_bboxes[obj_id - 1, :] = bbox
+    return instance_bboxes
+
+
+def export(mesh_file,
+           agg_file,
+           seg_file,
+           meta_file,
+           label_map_file,
+           output_file=None,
+           test_mode=False,
+           scannet200=False):
+    """Export original files to vert, ins_label, sem_label and bbox file.
+
+    Args:
+        mesh_file (str): Path of the mesh_file.
+        agg_file (str): Path of the agg_file.
+        seg_file (str): Path of the seg_file.
+        meta_file (str): Path of the meta_file.
+        label_map_file (str): Path of the label_map_file.
+        output_file (str): Path of the output folder.
+            Default: None.
+        test_mode (bool): Whether to generate test data without labels.
+            Default: False.
+        scannet200 (bool): Whether to use the ScanNet200 label mapping.
+            Default: False.
+
+    It returns a tuple that contains the following:
+        np.ndarray: Vertices of points data.
+        np.ndarray: Indexes of label.
+        np.ndarray: Indexes of instance.
+        np.ndarray: Unaligned instance bboxes.
+        np.ndarray: Aligned instance bboxes.
+        dict: Map from object_id to label_id.
+        np.ndarray: Axis align matrix.
+ """ + if scannet200: + label_map = scannet_utils.read_label_mapping( + label_map_file, label_from='raw_category', label_to='id') + else: + label_map = scannet_utils.read_label_mapping( + label_map_file, label_from='raw_category', label_to='nyu40id') + + mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file) + + # Load scene axis alignment matrix + lines = open(meta_file).readlines() + # test set data doesn't have align_matrix + axis_align_matrix = np.eye(4) + for line in lines: + if 'axisAlignment' in line: + axis_align_matrix = [ + float(x) + for x in line.rstrip().strip('axisAlignment = ').split(' ') + ] + break + axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4)) + + # perform global alignment of mesh vertices + pts = np.ones((mesh_vertices.shape[0], 4)) + pts[:, 0:3] = mesh_vertices[:, 0:3] + pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4 + aligned_mesh_vertices = np.concatenate([pts[:, 0:3], mesh_vertices[:, 3:]], + axis=1) + + # Load semantic and instance labels + if not test_mode: + object_id_to_segs, label_to_segs = read_aggregation(agg_file) + seg_to_verts, num_verts = read_segmentation(seg_file) + label_ids = np.zeros(shape=(num_verts), dtype=np.uint32) + object_id_to_label_id = {} + for label, segs in label_to_segs.items(): + label_id = label_map[label] + for seg in segs: + verts = seg_to_verts[seg] + label_ids[verts] = label_id + instance_ids = np.zeros( + shape=(num_verts), dtype=np.uint32) # 0: unannotated + for object_id, segs in object_id_to_segs.items(): + for seg in segs: + verts = seg_to_verts[seg] + instance_ids[verts] = object_id + if object_id not in object_id_to_label_id: + object_id_to_label_id[object_id] = label_ids[verts][0] + unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs, + object_id_to_label_id, instance_ids) + aligned_bboxes = extract_bbox(aligned_mesh_vertices, object_id_to_segs, + object_id_to_label_id, instance_ids) + else: + label_ids = None + instance_ids = None + unaligned_bboxes = None + aligned_bboxes = None + object_id_to_label_id = None + + if output_file is not None: + np.save(output_file + '_vert.npy', mesh_vertices) + if not test_mode: + np.save(output_file + '_sem_label.npy', label_ids) + np.save(output_file + '_ins_label.npy', instance_ids) + np.save(output_file + '_unaligned_bbox.npy', unaligned_bboxes) + np.save(output_file + '_aligned_bbox.npy', aligned_bboxes) + np.save(output_file + '_axis_align_matrix.npy', axis_align_matrix) + + return mesh_vertices, label_ids, instance_ids, unaligned_bboxes, \ + aligned_bboxes, object_id_to_label_id, axis_align_matrix + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--scan_path', + required=True, + help='path to scannet scene (e.g., data/ScanNet/v2/scene0000_00') + parser.add_argument('--output_file', required=True, help='output file') + parser.add_argument( + '--label_map_file', + required=True, + help='path to scannetv2-labels.combined.tsv') + parser.add_argument( + '--scannet200', + action='store_true', + help='Use it for scannet200 mapping') + + opt = parser.parse_args() + + scan_name = os.path.split(opt.scan_path)[-1] + mesh_file = os.path.join(opt.scan_path, scan_name + '_vh_clean_2.ply') + agg_file = os.path.join(opt.scan_path, scan_name + '.aggregation.json') + seg_file = os.path.join(opt.scan_path, + scan_name + '_vh_clean_2.0.010000.segs.json') + meta_file = os.path.join( + opt.scan_path, scan_name + + '.txt') # includes axisAlignment info for the train set scans. 
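+    # Export and save the point cloud, labels, boxes and axis-align matrix
+    # for this single scan under the given output file prefix.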
+ export(mesh_file, agg_file, seg_file, meta_file, opt.label_map_file, + opt.output_file, scannet200=opt.scannet200) + + +if __name__ == '__main__': + main() diff --git a/data/scannet/meta_data/scannet_means.npz b/data/scannet/meta_data/scannet_means.npz new file mode 100644 index 0000000..e57647c Binary files /dev/null and b/data/scannet/meta_data/scannet_means.npz differ diff --git a/data/scannet/meta_data/scannet_train.txt b/data/scannet/meta_data/scannet_train.txt new file mode 100644 index 0000000..d6f5e8a --- /dev/null +++ b/data/scannet/meta_data/scannet_train.txt @@ -0,0 +1,1513 @@ +scene0000_00 +scene0000_01 +scene0000_02 +scene0001_00 +scene0001_01 +scene0002_00 +scene0002_01 +scene0003_00 +scene0003_01 +scene0003_02 +scene0004_00 +scene0005_00 +scene0005_01 +scene0006_00 +scene0006_01 +scene0006_02 +scene0007_00 +scene0008_00 +scene0009_00 +scene0009_01 +scene0009_02 +scene0010_00 +scene0010_01 +scene0011_00 +scene0011_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0013_00 +scene0013_01 +scene0013_02 +scene0014_00 +scene0015_00 +scene0016_00 +scene0016_01 +scene0016_02 +scene0017_00 +scene0017_01 +scene0017_02 +scene0018_00 +scene0019_00 +scene0019_01 +scene0020_00 +scene0020_01 +scene0021_00 +scene0022_00 +scene0022_01 +scene0023_00 +scene0024_00 +scene0024_01 +scene0024_02 +scene0025_00 +scene0025_01 +scene0025_02 +scene0026_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0031_00 +scene0031_01 +scene0031_02 +scene0032_00 +scene0032_01 +scene0033_00 +scene0034_00 +scene0034_01 +scene0034_02 +scene0035_00 +scene0035_01 +scene0036_00 +scene0036_01 +scene0037_00 +scene0038_00 +scene0038_01 +scene0038_02 +scene0039_00 +scene0039_01 +scene0040_00 +scene0040_01 +scene0041_00 +scene0041_01 +scene0042_00 +scene0042_01 +scene0042_02 +scene0043_00 +scene0043_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0045_00 +scene0045_01 +scene0046_00 +scene0046_01 +scene0046_02 +scene0047_00 +scene0048_00 +scene0048_01 +scene0049_00 +scene0050_00 +scene0050_01 +scene0050_02 +scene0051_00 +scene0051_01 +scene0051_02 +scene0051_03 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0054_00 +scene0055_00 +scene0055_01 +scene0055_02 +scene0056_00 +scene0056_01 +scene0057_00 +scene0057_01 +scene0058_00 +scene0058_01 +scene0059_00 +scene0059_01 +scene0059_02 +scene0060_00 +scene0060_01 +scene0061_00 +scene0061_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0063_00 +scene0064_00 +scene0064_01 +scene0065_00 +scene0065_01 +scene0065_02 +scene0066_00 +scene0067_00 +scene0067_01 +scene0067_02 +scene0068_00 +scene0068_01 +scene0069_00 +scene0070_00 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0073_00 +scene0073_01 +scene0073_02 +scene0073_03 +scene0074_00 +scene0074_01 +scene0074_02 +scene0075_00 +scene0076_00 +scene0077_00 +scene0077_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0079_00 +scene0079_01 +scene0080_00 +scene0080_01 +scene0080_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0082_00 +scene0083_00 +scene0083_01 +scene0084_00 +scene0084_01 +scene0084_02 +scene0085_00 +scene0085_01 +scene0086_00 +scene0086_01 +scene0086_02 +scene0087_00 +scene0087_01 +scene0087_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0089_00 +scene0089_01 +scene0089_02 +scene0090_00 +scene0091_00 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0093_00 +scene0093_01 +scene0093_02 +scene0094_00 +scene0095_00 +scene0095_01 +scene0096_00 
+scene0096_01 +scene0096_02 +scene0097_00 +scene0098_00 +scene0098_01 +scene0099_00 +scene0099_01 +scene0100_00 +scene0100_01 +scene0100_02 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0102_00 +scene0102_01 +scene0103_00 +scene0103_01 +scene0104_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0106_00 +scene0106_01 +scene0106_02 +scene0107_00 +scene0108_00 +scene0109_00 +scene0109_01 +scene0110_00 +scene0110_01 +scene0110_02 +scene0111_00 +scene0111_01 +scene0111_02 +scene0112_00 +scene0112_01 +scene0112_02 +scene0113_00 +scene0113_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0115_00 +scene0115_01 +scene0115_02 +scene0116_00 +scene0116_01 +scene0116_02 +scene0117_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0119_00 +scene0120_00 +scene0120_01 +scene0121_00 +scene0121_01 +scene0121_02 +scene0122_00 +scene0122_01 +scene0123_00 +scene0123_01 +scene0123_02 +scene0124_00 +scene0124_01 +scene0125_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0127_00 +scene0127_01 +scene0128_00 +scene0129_00 +scene0130_00 +scene0131_00 +scene0131_01 +scene0131_02 +scene0132_00 +scene0132_01 +scene0132_02 +scene0133_00 +scene0134_00 +scene0134_01 +scene0134_02 +scene0135_00 +scene0136_00 +scene0136_01 +scene0136_02 +scene0137_00 +scene0137_01 +scene0137_02 +scene0138_00 +scene0139_00 +scene0140_00 +scene0140_01 +scene0141_00 +scene0141_01 +scene0141_02 +scene0142_00 +scene0142_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0144_00 +scene0144_01 +scene0145_00 +scene0146_00 +scene0146_01 +scene0146_02 +scene0147_00 +scene0147_01 +scene0148_00 +scene0149_00 +scene0150_00 +scene0150_01 +scene0150_02 +scene0151_00 +scene0151_01 +scene0152_00 +scene0152_01 +scene0152_02 +scene0153_00 +scene0153_01 +scene0154_00 +scene0155_00 +scene0155_01 +scene0155_02 +scene0156_00 +scene0157_00 +scene0157_01 +scene0158_00 +scene0158_01 +scene0158_02 +scene0159_00 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0161_00 +scene0161_01 +scene0161_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0165_00 +scene0165_01 +scene0165_02 +scene0166_00 +scene0166_01 +scene0166_02 +scene0167_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0169_00 +scene0169_01 +scene0170_00 +scene0170_01 +scene0170_02 +scene0171_00 +scene0171_01 +scene0172_00 +scene0172_01 +scene0173_00 +scene0173_01 +scene0173_02 +scene0174_00 +scene0174_01 +scene0175_00 +scene0176_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0178_00 +scene0179_00 +scene0180_00 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0182_00 +scene0182_01 +scene0182_02 +scene0183_00 +scene0184_00 +scene0185_00 +scene0186_00 +scene0186_01 +scene0187_00 +scene0187_01 +scene0188_00 +scene0189_00 +scene0190_00 +scene0191_00 +scene0191_01 +scene0191_02 +scene0192_00 +scene0192_01 +scene0192_02 +scene0193_00 +scene0193_01 +scene0194_00 +scene0195_00 +scene0195_01 +scene0195_02 +scene0196_00 +scene0197_00 +scene0197_01 +scene0197_02 +scene0198_00 +scene0199_00 +scene0200_00 +scene0200_01 +scene0200_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0202_00 +scene0203_00 +scene0203_01 +scene0203_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0205_00 +scene0205_01 +scene0205_02 +scene0206_00 +scene0206_01 +scene0206_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0208_00 +scene0209_00 +scene0209_01 +scene0209_02 +scene0210_00 +scene0210_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0212_00 +scene0212_01 
+scene0212_02 +scene0213_00 +scene0214_00 +scene0214_01 +scene0214_02 +scene0215_00 +scene0215_01 +scene0216_00 +scene0217_00 +scene0218_00 +scene0218_01 +scene0219_00 +scene0220_00 +scene0220_01 +scene0220_02 +scene0221_00 +scene0221_01 +scene0222_00 +scene0222_01 +scene0223_00 +scene0223_01 +scene0223_02 +scene0224_00 +scene0225_00 +scene0226_00 +scene0226_01 +scene0227_00 +scene0228_00 +scene0229_00 +scene0229_01 +scene0229_02 +scene0230_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0232_00 +scene0232_01 +scene0232_02 +scene0233_00 +scene0233_01 +scene0234_00 +scene0235_00 +scene0236_00 +scene0236_01 +scene0237_00 +scene0237_01 +scene0238_00 +scene0238_01 +scene0239_00 +scene0239_01 +scene0239_02 +scene0240_00 +scene0241_00 +scene0241_01 +scene0241_02 +scene0242_00 +scene0242_01 +scene0242_02 +scene0243_00 +scene0244_00 +scene0244_01 +scene0245_00 +scene0246_00 +scene0247_00 +scene0247_01 +scene0248_00 +scene0248_01 +scene0248_02 +scene0249_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0251_00 +scene0252_00 +scene0253_00 +scene0254_00 +scene0254_01 +scene0255_00 +scene0255_01 +scene0255_02 +scene0256_00 +scene0256_01 +scene0256_02 +scene0257_00 +scene0258_00 +scene0259_00 +scene0259_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0262_00 +scene0262_01 +scene0263_00 +scene0263_01 +scene0264_00 +scene0264_01 +scene0264_02 +scene0265_00 +scene0265_01 +scene0265_02 +scene0266_00 +scene0266_01 +scene0267_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0272_00 +scene0272_01 +scene0273_00 +scene0273_01 +scene0274_00 +scene0274_01 +scene0274_02 +scene0275_00 +scene0276_00 +scene0276_01 +scene0277_00 +scene0277_01 +scene0277_02 +scene0278_00 +scene0278_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0280_00 +scene0280_01 +scene0280_02 +scene0281_00 +scene0282_00 +scene0282_01 +scene0282_02 +scene0283_00 +scene0284_00 +scene0285_00 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0287_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0289_00 +scene0289_01 +scene0290_00 +scene0291_00 +scene0291_01 +scene0291_02 +scene0292_00 +scene0292_01 +scene0293_00 +scene0293_01 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0297_00 +scene0297_01 +scene0297_02 +scene0298_00 +scene0299_00 +scene0299_01 +scene0300_00 +scene0300_01 +scene0301_00 +scene0301_01 +scene0301_02 +scene0302_00 +scene0302_01 +scene0303_00 +scene0303_01 +scene0303_02 +scene0304_00 +scene0305_00 +scene0305_01 +scene0306_00 +scene0306_01 +scene0307_00 +scene0307_01 +scene0307_02 +scene0308_00 +scene0309_00 +scene0309_01 +scene0310_00 +scene0310_01 +scene0310_02 +scene0311_00 +scene0312_00 +scene0312_01 +scene0312_02 +scene0313_00 +scene0313_01 +scene0313_02 +scene0314_00 +scene0315_00 +scene0316_00 +scene0317_00 +scene0317_01 +scene0318_00 +scene0319_00 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0321_00 +scene0322_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0325_00 +scene0325_01 +scene0326_00 +scene0327_00 +scene0328_00 +scene0329_00 +scene0329_01 +scene0329_02 +scene0330_00 +scene0331_00 +scene0331_01 +scene0332_00 +scene0332_01 +scene0332_02 +scene0333_00 +scene0334_00 +scene0334_01 +scene0334_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0336_00 +scene0336_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0338_00 +scene0338_01 
+scene0338_02 +scene0339_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0341_00 +scene0341_01 +scene0342_00 +scene0343_00 +scene0344_00 +scene0344_01 +scene0345_00 +scene0345_01 +scene0346_00 +scene0346_01 +scene0347_00 +scene0347_01 +scene0347_02 +scene0348_00 +scene0348_01 +scene0348_02 +scene0349_00 +scene0349_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0351_00 +scene0351_01 +scene0352_00 +scene0352_01 +scene0352_02 +scene0353_00 +scene0353_01 +scene0353_02 +scene0354_00 +scene0355_00 +scene0355_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0357_00 +scene0357_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0359_00 +scene0359_01 +scene0360_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0363_00 +scene0364_00 +scene0364_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0366_00 +scene0367_00 +scene0367_01 +scene0368_00 +scene0368_01 +scene0369_00 +scene0369_01 +scene0369_02 +scene0370_00 +scene0370_01 +scene0370_02 +scene0371_00 +scene0371_01 +scene0372_00 +scene0373_00 +scene0373_01 +scene0374_00 +scene0375_00 +scene0375_01 +scene0375_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0377_00 +scene0377_01 +scene0377_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0379_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0381_00 +scene0381_01 +scene0381_02 +scene0382_00 +scene0382_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0386_00 +scene0387_00 +scene0387_01 +scene0387_02 +scene0388_00 +scene0388_01 +scene0389_00 +scene0390_00 +scene0391_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0394_00 +scene0394_01 +scene0395_00 +scene0395_01 +scene0395_02 +scene0396_00 +scene0396_01 +scene0396_02 +scene0397_00 +scene0397_01 +scene0398_00 +scene0398_01 +scene0399_00 +scene0399_01 +scene0400_00 +scene0400_01 +scene0401_00 +scene0402_00 +scene0403_00 +scene0403_01 +scene0404_00 +scene0404_01 +scene0404_02 +scene0405_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0407_00 +scene0407_01 +scene0408_00 +scene0408_01 +scene0409_00 +scene0409_01 +scene0410_00 +scene0410_01 +scene0411_00 +scene0411_01 +scene0411_02 +scene0412_00 +scene0412_01 +scene0413_00 +scene0414_00 +scene0415_00 +scene0415_01 +scene0415_02 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0417_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0419_00 +scene0419_01 +scene0419_02 +scene0420_00 +scene0420_01 +scene0420_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0422_00 +scene0423_00 +scene0423_01 +scene0423_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0425_00 +scene0425_01 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0427_00 +scene0428_00 +scene0428_01 +scene0429_00 +scene0430_00 +scene0430_01 +scene0431_00 +scene0432_00 +scene0432_01 +scene0433_00 +scene0434_00 +scene0434_01 +scene0434_02 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0436_00 +scene0437_00 +scene0437_01 +scene0438_00 +scene0439_00 +scene0439_01 +scene0440_00 +scene0440_01 +scene0440_02 +scene0441_00 +scene0442_00 +scene0443_00 +scene0444_00 +scene0444_01 +scene0445_00 +scene0445_01 +scene0446_00 +scene0446_01 +scene0447_00 +scene0447_01 +scene0447_02 +scene0448_00 +scene0448_01 +scene0448_02 +scene0449_00 +scene0449_01 +scene0449_02 +scene0450_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0452_00 +scene0452_01 +scene0452_02 +scene0453_00 +scene0453_01 
+scene0454_00 +scene0455_00 +scene0456_00 +scene0456_01 +scene0457_00 +scene0457_01 +scene0457_02 +scene0458_00 +scene0458_01 +scene0459_00 +scene0459_01 +scene0460_00 +scene0461_00 +scene0462_00 +scene0463_00 +scene0463_01 +scene0464_00 +scene0465_00 +scene0465_01 +scene0466_00 +scene0466_01 +scene0467_00 +scene0468_00 +scene0468_01 +scene0468_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0470_00 +scene0470_01 +scene0471_00 +scene0471_01 +scene0471_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0473_00 +scene0473_01 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0475_00 +scene0475_01 +scene0475_02 +scene0476_00 +scene0476_01 +scene0476_02 +scene0477_00 +scene0477_01 +scene0478_00 +scene0478_01 +scene0479_00 +scene0479_01 +scene0479_02 +scene0480_00 +scene0480_01 +scene0481_00 +scene0481_01 +scene0482_00 +scene0482_01 +scene0483_00 +scene0484_00 +scene0484_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 +scene0488_00 +scene0488_01 +scene0489_00 +scene0489_01 +scene0489_02 +scene0490_00 +scene0491_00 +scene0492_00 +scene0492_01 +scene0493_00 +scene0493_01 +scene0494_00 +scene0495_00 +scene0496_00 +scene0497_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0499_00 +scene0500_00 +scene0500_01 +scene0501_00 +scene0501_01 +scene0501_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0503_00 +scene0504_00 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0507_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0511_00 +scene0511_01 +scene0512_00 +scene0513_00 +scene0514_00 +scene0514_01 +scene0515_00 +scene0515_01 +scene0515_02 +scene0516_00 +scene0516_01 +scene0517_00 +scene0517_01 +scene0517_02 +scene0518_00 +scene0519_00 +scene0520_00 +scene0520_01 +scene0521_00 +scene0522_00 +scene0523_00 +scene0523_01 +scene0523_02 +scene0524_00 +scene0524_01 +scene0525_00 +scene0525_01 +scene0525_02 +scene0526_00 +scene0526_01 +scene0527_00 +scene0528_00 +scene0528_01 +scene0529_00 +scene0529_01 +scene0529_02 +scene0530_00 +scene0531_00 +scene0532_00 +scene0532_01 +scene0533_00 +scene0533_01 +scene0534_00 +scene0534_01 +scene0535_00 +scene0536_00 +scene0536_01 +scene0536_02 +scene0537_00 +scene0538_00 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0541_00 +scene0541_01 +scene0541_02 +scene0542_00 +scene0543_00 +scene0543_01 +scene0543_02 +scene0544_00 +scene0545_00 +scene0545_01 +scene0545_02 +scene0546_00 +scene0547_00 +scene0547_01 +scene0547_02 +scene0548_00 +scene0548_01 +scene0548_02 +scene0549_00 +scene0549_01 +scene0550_00 +scene0551_00 +scene0552_00 +scene0552_01 +scene0553_00 +scene0553_01 +scene0553_02 +scene0554_00 +scene0554_01 +scene0555_00 +scene0556_00 +scene0556_01 +scene0557_00 +scene0557_01 +scene0557_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0560_00 +scene0561_00 +scene0561_01 +scene0562_00 +scene0563_00 +scene0564_00 +scene0565_00 +scene0566_00 +scene0567_00 +scene0567_01 +scene0568_00 +scene0568_01 +scene0568_02 +scene0569_00 +scene0569_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0571_00 +scene0571_01 +scene0572_00 +scene0572_01 +scene0572_02 +scene0573_00 +scene0573_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0575_00 +scene0575_01 +scene0575_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0577_00 +scene0578_00 +scene0578_01 +scene0578_02 +scene0579_00 +scene0579_01 +scene0579_02 
+scene0580_00 +scene0580_01 +scene0581_00 +scene0581_01 +scene0581_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0584_00 +scene0584_01 +scene0584_02 +scene0585_00 +scene0585_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0589_00 +scene0589_01 +scene0589_02 +scene0590_00 +scene0590_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0592_00 +scene0592_01 +scene0593_00 +scene0593_01 +scene0594_00 +scene0595_00 +scene0596_00 +scene0596_01 +scene0596_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 +scene0599_01 +scene0599_02 +scene0600_00 +scene0600_01 +scene0600_02 +scene0601_00 +scene0601_01 +scene0602_00 +scene0603_00 +scene0603_01 +scene0604_00 +scene0604_01 +scene0604_02 +scene0605_00 +scene0605_01 +scene0606_00 +scene0606_01 +scene0606_02 +scene0607_00 +scene0607_01 +scene0608_00 +scene0608_01 +scene0608_02 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0610_00 +scene0610_01 +scene0610_02 +scene0611_00 +scene0611_01 +scene0612_00 +scene0612_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0614_00 +scene0614_01 +scene0614_02 +scene0615_00 +scene0615_01 +scene0616_00 +scene0616_01 +scene0617_00 +scene0618_00 +scene0619_00 +scene0620_00 +scene0620_01 +scene0621_00 +scene0622_00 +scene0622_01 +scene0623_00 +scene0623_01 +scene0624_00 +scene0625_00 +scene0625_01 +scene0626_00 +scene0626_01 +scene0626_02 +scene0627_00 +scene0627_01 +scene0628_00 +scene0628_01 +scene0628_02 +scene0629_00 +scene0629_01 +scene0629_02 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0631_00 +scene0631_01 +scene0631_02 +scene0632_00 +scene0633_00 +scene0633_01 +scene0634_00 +scene0635_00 +scene0635_01 +scene0636_00 +scene0637_00 +scene0638_00 +scene0639_00 +scene0640_00 +scene0640_01 +scene0640_02 +scene0641_00 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0647_00 +scene0647_01 +scene0648_00 +scene0648_01 +scene0649_00 +scene0649_01 +scene0650_00 +scene0651_00 +scene0651_01 +scene0651_02 +scene0652_00 +scene0653_00 +scene0653_01 +scene0654_00 +scene0654_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0657_00 +scene0658_00 +scene0659_00 +scene0659_01 +scene0660_00 +scene0661_00 +scene0662_00 +scene0662_01 +scene0662_02 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0665_00 +scene0665_01 +scene0666_00 +scene0666_01 +scene0666_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0668_00 +scene0669_00 +scene0669_01 +scene0670_00 +scene0670_01 +scene0671_00 +scene0671_01 +scene0672_00 +scene0672_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0674_00 +scene0674_01 +scene0675_00 +scene0675_01 +scene0676_00 +scene0676_01 +scene0677_00 +scene0677_01 +scene0677_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0679_00 +scene0679_01 +scene0680_00 +scene0680_01 +scene0681_00 +scene0682_00 +scene0683_00 +scene0684_00 +scene0684_01 +scene0685_00 +scene0685_01 +scene0685_02 +scene0686_00 +scene0686_01 +scene0686_02 +scene0687_00 +scene0688_00 +scene0689_00 +scene0690_00 +scene0690_01 +scene0691_00 +scene0691_01 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 
+scene0692_04 +scene0693_00 +scene0693_01 +scene0693_02 +scene0694_00 +scene0694_01 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0696_00 +scene0696_01 +scene0696_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0698_00 +scene0698_01 +scene0699_00 +scene0700_00 +scene0700_01 +scene0700_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0702_00 +scene0702_01 +scene0702_02 +scene0703_00 +scene0703_01 +scene0704_00 +scene0704_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0706_00 diff --git a/data/scannet/meta_data/scannetv2-labels.combined.tsv b/data/scannet/meta_data/scannetv2-labels.combined.tsv new file mode 100644 index 0000000..b2ad275 --- /dev/null +++ b/data/scannet/meta_data/scannetv2-labels.combined.tsv @@ -0,0 +1,608 @@ +id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index +1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39 +3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4 +1163 object object 1313 40 7 otherprop Objects objects 39 +16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9 +4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +13 pillow pillow 937 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39 +161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4 +19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39 +7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5 +8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3 +31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15 +48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39 +28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21 +21 curtain curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +52 whiteboard 
whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35 +96 radiator radiator 322 39 6 radiator otherfurniture Furniture n04041069 radiator.n.02 misc 40 +22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39 +29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3 +63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5 +17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18 +47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39 +65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +33 tv tv 219 25 11 television television TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22 +75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +64 computer tower computer tower 203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39 +32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39 +130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37 +44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38 +55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12 +42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25 +59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37 +159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39 +1164 
bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5 +93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39 +77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 3642806 n03642806 laptop.n.01 objects 39 +67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +69 board board 100 38 7 board otherstructure Objects board_panel 35 +100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37 +7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4 +76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39 +54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39 +125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40 +72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37 +68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40 +157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39 +1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39 +132 storage bin storage bin 63 40 7 storage bin otherprop Objects objects 39 +1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4 +232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40 +134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 coffee_maker.n.01 appliances 37 +51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36 +250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40 +1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40 +89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35 +103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39 +99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 
display_panel.n.01 board_panel 35 +140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40 +116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39 +73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39 +78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +1170 dumbbell dumbbell 48 40 7 otherprop Objects n03255030 dumbbell.n.01 objects 39 +79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40 +141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9 +57 closet closet 45 39 6 wardrobe otherfurniture Furniture wardrobe misc 40 +102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40 +261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39 +118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39 +136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37 +98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12 +1172 tube tube 41 40 7 otherprop Objects misc 40 +1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +221 storage container storage container 39 40 7 container otherprop Objects objects 39 +570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39 +138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +168 ball ball 39 40 7 ball otherprop Objects objects 39 +276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4 +106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31 +276 closet door closet door 35 8 12 door door Wall door n03221720 door.n.01 door 4 +323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39 +58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32 +2 stack of chairs chair 35 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39 +121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39 +185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39 +300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39 +180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39 +163 
toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39 +26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39 +66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39 +208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39 +112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39 +540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40 +166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40 +122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16 +120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24 +107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40 +283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39 +88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 39 +90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40 +1174 cd case cd case 24 40 7 otherprop Objects objects 39 +562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 2880940 n02880940 bowl.n.03 objects 39 +1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40 +1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37 +104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5 +229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39 +70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39 +169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1 +331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39 +87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39 +370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39 +191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24 +748 divider divider 20 40 7 otherprop Objects wall 1 +242 power outlet power outlet 19 40 7 otherprop Objects misc 40 +45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2 +70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4 +1176 coffee kettle coffee kettle 18 40 7 pot otherprop Objects 
n03612814 kettle.n.01 objects 39 +1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1178 structure structure 18 38 7 otherstructure Objects misc 40 +18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37 +63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39 +572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +1179 shower head shower head 15 38 7 otherstructure Objects shower 23 +28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39 +1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40 +1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39 +195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37 +581 music stand music stand 14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36 +58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3 +1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3 +139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37 +1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40 +156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27 +408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40 +213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36 +1186 power strip power strip 13 40 7 otherprop Objects objects 39 +1187 calendar calendar 13 40 7 otherprop Objects objects 39 +1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39 +1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39 +233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39 +286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39 +264 projector projector 12 40 7 projector otherprop Objects 
n04009552 projector.n.02 objects 39 +110 clothes dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39 +25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 furniture 36 +750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40 +269 globe globe 11 40 7 globe otherprop Objects objects 39 +307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26 +216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19 +1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30 +119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37 +682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +434 swiffer swiffer 11 40 7 otherprop Objects objects 39 +126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5 +135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39 +1194 poster tube poster tube 10 40 7 otherprop Objects objects 39 +432 case case 10 40 7 case otherprop Objects objects 39 +53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40 +111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38 +305 water fountain water fountain 10 38 7 water fountain otherstructure Objects n03241335 drinking_fountain.n.01 misc 40 +1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40 +13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40 +1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39 +1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39 +378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39 +591 instrument case instrument case 9 40 7 case otherprop Objects objects 39 +49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39 +1098 block block 9 40 7 otherprop Objects misc 40 +291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6 +1063 kitchen island kitchen island 8 38 7 
kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26 +107 pipes pipe 8 38 7 otherstructure Objects misc 40 +1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39 +189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38 +245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39 +194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33 +1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38 +452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39 +1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38 +152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30 +83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40 +1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39 +61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39 +540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +1205 tupperware tupperware 7 40 7 otherprop Objects objects 39 +415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31 +31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33 +1207 salt salt 6 40 7 otherprop Objects objects 39 +129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13 +220 dispenser dispenser 6 40 7 otherprop Objects n03210683 dispenser.n.01 objects 39 +1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4 +231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39 +1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 misc 40 +39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1210 carton carton 6 40 7 otherprop Objects objects 39 +117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40 +822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31 +238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38 +143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5 +1211 soda stream soda stream 6 40 7 otherprop Objects objects 39 +228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39 +494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 
2818832 n02818832 bed.n.01 bed 11 +226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39 +91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40 +1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39 +435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39 +1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +345 scanner scanner 5 40 7 otherprop Objects appliances 37 +893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33 +621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39 +297 dumbell dumbell 5 40 7 otherprop Objects objects 39 +1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39 +1214 rice cooker rice cooker 5 40 7 otherprop Objects objects 39 +1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39 +529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39 +1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30 +1217 flowerpot flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16 +525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39 +204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39 +693 binders binder 5 40 7 binder otherprop Objects objects 39 +179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39 +1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +1223 hanging hanging 5 40 7 otherprop Objects misc 40 +1224 mail mail 5 40 7 otherprop Objects misc 40 +1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39 +1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3 +571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40 +1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39 +280 plastic container plastic container 5 40 7 container otherprop Objects objects 39 +1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39 +1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39 +1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +746 frying pan frying pan 4 40 7 frying pan otherprop Objects n03400231 frying_pan.n.01 objects 39 +1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39 +1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39 +282 dollhouse dollhouse 4 39 6 doll house 
otherfurniture Furniture n03219483 dollhouse.n.01 objects 39 +167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39 +1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39 +1237 display case display case 4 40 7 case otherprop Objects objects 39 +234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +563 boiler boiler 4 40 7 otherprop Objects misc 40 +1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39 +1240 carseat carseat 4 40 7 otherprop Objects misc 40 +366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38 +816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39 +1241 coffee box coffee box 4 40 7 otherprop Objects objects 39 +719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39 +284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40 +1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39 +1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36 +1245 storage box storage box 4 29 7 box box Objects n02883344 box.n.01 objects 39 +1246 dolly dolly 4 40 7 otherprop Objects misc 40 +1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39 +592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39 +385 cabinet door cabinet door 3 8 12 door door Wall door door 4 +1248 changing station changing station 3 40 7 otherprop Objects misc 40 +1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12 +301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39 +1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39 +379 studio light studio light 3 38 7 light otherstructure Objects lighting 28 +130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39 +450 trunk trunk 3 40 7 otherprop Objects misc 40 +1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39 +316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39 +1253 pizza box pizza box 3 29 7 box box Objects objects 39 +385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4 +1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40 +461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40 +1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39 +1256 luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +281 statue 
statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40 +1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 urinal.n.01 toilet 18 +1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40 +1259 bike pump bike pump 3 40 7 otherprop Objects objects 39 +319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1260 bear bear 3 40 7 otherprop Objects objects 39 +28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1261 humidifier humidifier 3 40 7 otherprop Objects objects 39 +546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39 +1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39 +1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39 +1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1266 camera camera 3 40 7 otherprop Objects objects 39 +28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28 +1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1268 card card 3 40 7 otherprop Objects objects 39 +1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4 +689 cardboard cardboard 3 40 7 otherprop Objects objects 39 +1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39 +1272 flag flag 3 40 7 otherprop Objects misc 40 +354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10 +339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39 +1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40 +1273 rolled poster rolled poster 3 40 7 otherprop Objects objects 39 +1274 wheel wheel 3 40 7 otherprop Objects objects 39 +15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39 +361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39 +1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39 +326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39 +1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39 +116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1280 lunch box lunch box 2 40 7 otherprop Objects objects 39 +1281 food display food display 2 40 7 otherprop Objects misc 40 +794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31 +1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4 +955 
pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38 +387 wood wood 2 40 7 otherprop Objects misc 40 +69 boards board 2 38 7 board otherstructure Objects board_panel 35 +65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20 +389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5 +29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +146 frame frame 2 38 7 otherstructure Objects misc 40 +130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33 +289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36 +440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +321 roomba roomba 2 40 7 otherprop Objects objects 39 +976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4 +1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31 +1284 bike lock bike lock 2 40 7 otherprop Objects objects 39 +1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39 +357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20 +1286 bath products bath product 2 40 7 otherprop Objects objects 39 +1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40 +365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40 +1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11 +1289 ipad ipad 2 40 7 otherprop Objects objects 39 +1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +948 traffic cone traffic cone 2 40 7 cone otherprop Objects cone objects 39 +174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39 +1028 canopy canopy 2 40 7 otherprop Objects misc 40 +1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1292 paper organizer paper organizer 2 40 7 otherprop Objects objects 39 +1005 barricade barricade 2 40 7 otherprop Objects misc 40 +235 platform platform 2 38 7 otherstructure Objects misc 40 +1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39 +1295 elevator elevator 2 38 7 otherstructure Objects misc 40 +1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39 +1297 trash bag trash bag 2 37 7 bag bag Objects objects 39 +1298 santa santa 2 40 7 otherprop Objects misc 40 +1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39 +1300 boat boat 2 40 7 otherprop Objects misc 40 +1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39 +566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36 +1302 plastic storage bin plastic storage bin 2 40 7 container 
otherprop Objects n03094503 container.n.01 objects 39 +1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37 +1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39 +1306 banana holder banana holder 2 40 7 otherprop Objects objects 39 +298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5 +1307 airplane airplane 2 40 7 otherprop Objects misc 40 +1308 conditioner bottle conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39 +43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11 +1310 wood beam wood beam 2 38 7 otherstructure Objects beam 29 +593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39 +1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1312 film light film light 2 40 7 otherprop Objects lighting 28 +749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +623 chain chain 1 40 7 otherprop Objects chair 3 +1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38 +1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1315 water softener water softener 1 40 7 otherprop Objects misc 40 +448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40 +1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +801 loofa loofa 1 40 7 otherprop Objects objects 39 +972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23 +1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1318 fish fish 1 40 7 otherprop Objects n02512053 fish.n.01 objects 39 +75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7 +657 cat litter box cat litter box 1 29 7 box box Objects objects 39 +561 electric panel electric panel 1 40 7 otherprop Objects misc 40 +93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12 +411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11 +1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28 +922 tape tape 1 40 7 tape otherprop Objects objects 39 +88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39 +518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 
alarm_clock.n.01 objects 39 +814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40 +1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1320 cone cone 1 40 7 otherprop Objects objects 39 +649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4 +607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39 +1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40 +1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6 +817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40 +130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39 +712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39 +1323 folded boxes folded boxes 1 40 7 otherprop Objects objects 39 +1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +673 covered box covered box 1 29 7 box box Objects objects 39 +459 folder folder 1 40 7 folder otherprop Objects n03376279 folder.n.02 objects 39 +643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39 +238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38 +765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31 +1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39 +225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39 +1083 buddha buddha 1 40 7 otherprop Objects objects 39 +813 file organizer file organizer 1 40 7 otherprop Objects objects 39 +138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +796 fuse box fuse box 1 40 7 otherprop Objects misc 40 +1325 knife block knife block 1 40 7 otherprop Objects objects 39 +363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01 +1174 cd cases cd case 1 40 7 otherprop Objects objects 39 +38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39 +1327 pen holder pen holder 1 40 7 otherprop Objects objects 39 +1328 tray rack tray rack 1 40 7 otherprop Objects objects 39 +1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39 +182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40 +280 plastic containers plastic container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1330 night light night light 1 40 7 otherprop Objects lighting 28 +1331 notepad notepad 1 40 7 otherprop Objects objects 39 +1332 mail bin mail bin 1 40 7 otherprop Objects misc 40 +1333 elevator button elevator button 1 40 7 otherprop Objects misc 40 +939 gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39 +1334 drum set drum set 
1 40 7 otherprop Objects objects 39 +480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39 +907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39 +1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39 +829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39 +947 door wall door wall 1 1 12 wall wall Wall wall 1 +1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39 +599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40 +123 cover cover 1 40 7 blanket otherprop Objects objects 39 +506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39 +569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4 +1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33 +1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3 +1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +851 stepladder stepladder 1 39 6 ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16 +142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39 +436 cable cable 1 40 7 cables otherprop Objects objects 39 +1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36 +1342 costume costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3 +693 binder binder 1 40 7 binder otherprop Objects objects 39 +815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40 +1343 medal medal 1 40 7 otherprop Objects objects 39 +1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39 +1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4 +160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40 +1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38 +1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39 +397 tank tank 1 40 7 otherprop Objects objects 39 +643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39 +551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39 +1163 stick object 1 40 7 stick otherprop Objects objects 39 +1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39 +803 bycicle bycicle 1 40 7 
otherprop Objects misc 40 +484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36 +1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11 +1350 clip clip 1 40 7 otherprop Objects objects 39 +222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39 +1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40 +1352 postcard postcard 1 40 7 otherprop Objects objects 39 +828 display sign display sign 1 40 7 sign otherprop Objects misc 40 +1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39 +1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1356 food bag food bag 1 37 7 bag bag Objects objects 39 +1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40 +1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 \ No newline at end of file diff --git a/data/scannet/meta_data/scannetv2_test.txt b/data/scannet/meta_data/scannetv2_test.txt new file mode 100644 index 0000000..79d15b0 --- /dev/null +++ b/data/scannet/meta_data/scannetv2_test.txt @@ -0,0 +1,100 @@ +scene0707_00 +scene0708_00 +scene0709_00 +scene0710_00 +scene0711_00 +scene0712_00 +scene0713_00 +scene0714_00 +scene0715_00 +scene0716_00 +scene0717_00 +scene0718_00 +scene0719_00 +scene0720_00 +scene0721_00 +scene0722_00 +scene0723_00 +scene0724_00 +scene0725_00 +scene0726_00 +scene0727_00 +scene0728_00 +scene0729_00 +scene0730_00 +scene0731_00 +scene0732_00 +scene0733_00 +scene0734_00 +scene0735_00 +scene0736_00 +scene0737_00 +scene0738_00 +scene0739_00 +scene0740_00 +scene0741_00 +scene0742_00 +scene0743_00 +scene0744_00 +scene0745_00 +scene0746_00 +scene0747_00 +scene0748_00 +scene0749_00 +scene0750_00 +scene0751_00 +scene0752_00 +scene0753_00 +scene0754_00 +scene0755_00 +scene0756_00 +scene0757_00 +scene0758_00 +scene0759_00 +scene0760_00 +scene0761_00 +scene0762_00 +scene0763_00 +scene0764_00 +scene0765_00 +scene0766_00 +scene0767_00 +scene0768_00 +scene0769_00 +scene0770_00 +scene0771_00 +scene0772_00 +scene0773_00 +scene0774_00 +scene0775_00 +scene0776_00 +scene0777_00 +scene0778_00 +scene0779_00 +scene0780_00 +scene0781_00 +scene0782_00 +scene0783_00 +scene0784_00 +scene0785_00 +scene0786_00 +scene0787_00 +scene0788_00 +scene0789_00 +scene0790_00 +scene0791_00 +scene0792_00 +scene0793_00 +scene0794_00 +scene0795_00 +scene0796_00 +scene0797_00 +scene0798_00 +scene0799_00 +scene0800_00 +scene0801_00 +scene0802_00 +scene0803_00 +scene0804_00 +scene0805_00 +scene0806_00 diff --git a/data/scannet/meta_data/scannetv2_train.txt b/data/scannet/meta_data/scannetv2_train.txt new file mode 100644 index 0000000..ef625f1 --- /dev/null +++ b/data/scannet/meta_data/scannetv2_train.txt @@ -0,0 +1,1201 @@ +scene0191_00 +scene0191_01 +scene0191_02 +scene0119_00 +scene0230_00 +scene0528_00 +scene0528_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0415_00 +scene0415_01 +scene0415_02 +scene0007_00 +scene0141_00 +scene0141_01 +scene0141_02 +scene0515_00 +scene0515_01 +scene0515_02 
+scene0447_00 +scene0447_01 +scene0447_02 +scene0531_00 +scene0503_00 +scene0285_00 +scene0069_00 +scene0584_00 +scene0584_01 +scene0584_02 +scene0581_00 +scene0581_01 +scene0581_02 +scene0620_00 +scene0620_01 +scene0263_00 +scene0263_01 +scene0481_00 +scene0481_01 +scene0020_00 +scene0020_01 +scene0291_00 +scene0291_01 +scene0291_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0659_00 +scene0659_01 +scene0024_00 +scene0024_01 +scene0024_02 +scene0564_00 +scene0117_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0330_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0233_00 +scene0233_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0585_00 +scene0585_01 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0035_00 +scene0035_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0037_00 +scene0194_00 +scene0321_00 +scene0293_00 +scene0293_01 +scene0623_00 +scene0623_01 +scene0592_00 +scene0592_01 +scene0569_00 +scene0569_01 +scene0413_00 +scene0313_00 +scene0313_01 +scene0313_02 +scene0480_00 +scene0480_01 +scene0401_00 +scene0517_00 +scene0517_01 +scene0517_02 +scene0032_00 +scene0032_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0306_00 +scene0306_01 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0444_00 +scene0444_01 +scene0055_00 +scene0055_01 +scene0055_02 +scene0560_00 +scene0589_00 +scene0589_01 +scene0589_02 +scene0610_00 +scene0610_01 +scene0610_02 +scene0364_00 +scene0364_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0006_00 +scene0006_01 +scene0006_02 +scene0275_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0135_00 +scene0065_00 +scene0065_01 +scene0065_02 +scene0104_00 +scene0674_00 +scene0674_01 +scene0448_00 +scene0448_01 +scene0448_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0440_00 +scene0440_01 +scene0440_02 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0649_00 +scene0649_01 +scene0602_00 +scene0694_00 +scene0694_01 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0218_00 +scene0218_01 +scene0579_00 +scene0579_01 +scene0579_02 +scene0039_00 +scene0039_01 +scene0493_00 +scene0493_01 +scene0242_00 +scene0242_01 +scene0242_02 +scene0083_00 +scene0083_01 +scene0127_00 +scene0127_01 +scene0662_00 +scene0662_01 +scene0662_02 +scene0018_00 +scene0087_00 +scene0087_01 +scene0087_02 +scene0332_00 +scene0332_01 +scene0332_02 +scene0628_00 +scene0628_01 +scene0628_02 +scene0134_00 +scene0134_01 +scene0134_02 +scene0238_00 +scene0238_01 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0022_00 +scene0022_01 +scene0467_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0098_00 +scene0098_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0571_00 +scene0571_01 +scene0166_00 +scene0166_01 +scene0166_02 +scene0563_00 +scene0172_00 +scene0172_01 +scene0388_00 +scene0388_01 +scene0215_00 +scene0215_01 +scene0252_00 +scene0287_00 +scene0668_00 +scene0572_00 +scene0572_01 +scene0572_02 +scene0026_00 +scene0224_00 +scene0113_00 +scene0113_01 +scene0551_00 +scene0381_00 +scene0381_01 +scene0381_02 +scene0371_00 +scene0371_01 +scene0460_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0417_00 +scene0008_00 +scene0634_00 +scene0521_00 +scene0123_00 +scene0123_01 +scene0123_02 +scene0045_00 
+scene0045_01 +scene0511_00 +scene0511_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0070_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0129_00 +scene0103_00 +scene0103_01 +scene0002_00 +scene0002_01 +scene0132_00 +scene0132_01 +scene0132_02 +scene0124_00 +scene0124_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0604_00 +scene0604_01 +scene0604_02 +scene0507_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0428_00 +scene0428_01 +scene0311_00 +scene0140_00 +scene0140_01 +scene0182_00 +scene0182_01 +scene0182_02 +scene0142_00 +scene0142_01 +scene0399_00 +scene0399_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0060_00 +scene0060_01 +scene0370_00 +scene0370_01 +scene0370_02 +scene0310_00 +scene0310_01 +scene0310_02 +scene0661_00 +scene0650_00 +scene0152_00 +scene0152_01 +scene0152_02 +scene0158_00 +scene0158_01 +scene0158_02 +scene0482_00 +scene0482_01 +scene0600_00 +scene0600_01 +scene0600_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0562_00 +scene0174_00 +scene0174_01 +scene0157_00 +scene0157_01 +scene0161_00 +scene0161_01 +scene0161_02 +scene0159_00 +scene0254_00 +scene0254_01 +scene0115_00 +scene0115_01 +scene0115_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0523_00 +scene0523_01 +scene0523_02 +scene0459_00 +scene0459_01 +scene0175_00 +scene0085_00 +scene0085_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0283_00 +scene0456_00 +scene0456_01 +scene0429_00 +scene0043_00 +scene0043_01 +scene0419_00 +scene0419_01 +scene0419_02 +scene0368_00 +scene0368_01 +scene0348_00 +scene0348_01 +scene0348_02 +scene0442_00 +scene0178_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0165_00 +scene0165_01 +scene0165_02 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0333_00 +scene0614_00 +scene0614_01 +scene0614_02 +scene0404_00 +scene0404_01 +scene0404_02 +scene0185_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0519_00 +scene0236_00 +scene0236_01 +scene0189_00 +scene0075_00 +scene0267_00 +scene0192_00 +scene0192_01 +scene0192_02 +scene0281_00 +scene0420_00 +scene0420_01 +scene0420_02 +scene0195_00 +scene0195_01 +scene0195_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0041_00 +scene0041_01 +scene0111_00 +scene0111_01 +scene0111_02 +scene0666_00 +scene0666_01 +scene0666_02 +scene0200_00 +scene0200_01 +scene0200_02 +scene0536_00 +scene0536_01 +scene0536_02 +scene0390_00 +scene0280_00 +scene0280_01 +scene0280_02 +scene0344_00 +scene0344_01 +scene0205_00 +scene0205_01 +scene0205_02 +scene0484_00 +scene0484_01 +scene0009_00 +scene0009_01 +scene0009_02 +scene0302_00 +scene0302_01 +scene0209_00 +scene0209_01 +scene0209_02 +scene0210_00 +scene0210_01 +scene0395_00 +scene0395_01 +scene0395_02 +scene0683_00 +scene0601_00 +scene0601_01 +scene0214_00 +scene0214_01 +scene0214_02 +scene0477_00 +scene0477_01 +scene0439_00 +scene0439_01 +scene0468_00 +scene0468_01 +scene0468_02 +scene0546_00 +scene0466_00 +scene0466_01 +scene0220_00 +scene0220_01 +scene0220_02 +scene0122_00 +scene0122_01 +scene0130_00 +scene0110_00 +scene0110_01 +scene0110_02 +scene0327_00 +scene0156_00 +scene0266_00 +scene0266_01 +scene0001_00 +scene0001_01 +scene0228_00 +scene0199_00 +scene0219_00 +scene0464_00 +scene0232_00 +scene0232_01 +scene0232_02 +scene0299_00 +scene0299_01 +scene0530_00 +scene0363_00 +scene0453_00 +scene0453_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0183_00 +scene0239_00 +scene0239_01 +scene0239_02 +scene0373_00 +scene0373_01 +scene0241_00 +scene0241_01 +scene0241_02 +scene0188_00 +scene0622_00 +scene0622_01 
+scene0244_00 +scene0244_01 +scene0691_00 +scene0691_01 +scene0206_00 +scene0206_01 +scene0206_02 +scene0247_00 +scene0247_01 +scene0061_00 +scene0061_01 +scene0082_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0501_00 +scene0501_01 +scene0501_02 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0631_00 +scene0631_01 +scene0631_02 +scene0255_00 +scene0255_01 +scene0255_02 +scene0047_00 +scene0265_00 +scene0265_01 +scene0265_02 +scene0004_00 +scene0336_00 +scene0336_01 +scene0058_00 +scene0058_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0243_00 +scene0603_00 +scene0603_01 +scene0093_00 +scene0093_01 +scene0093_02 +scene0109_00 +scene0109_01 +scene0434_00 +scene0434_01 +scene0434_02 +scene0290_00 +scene0627_00 +scene0627_01 +scene0470_00 +scene0470_01 +scene0137_00 +scene0137_01 +scene0137_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0504_00 +scene0274_00 +scene0274_01 +scene0274_02 +scene0036_00 +scene0036_01 +scene0276_00 +scene0276_01 +scene0272_00 +scene0272_01 +scene0499_00 +scene0698_00 +scene0698_01 +scene0051_00 +scene0051_01 +scene0051_02 +scene0051_03 +scene0108_00 +scene0245_00 +scene0369_00 +scene0369_01 +scene0369_02 +scene0284_00 +scene0289_00 +scene0289_01 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0031_00 +scene0031_01 +scene0031_02 +scene0545_00 +scene0545_01 +scene0545_02 +scene0557_00 +scene0557_01 +scene0557_02 +scene0533_00 +scene0533_01 +scene0116_00 +scene0116_01 +scene0116_02 +scene0611_00 +scene0611_01 +scene0688_00 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0596_00 +scene0596_01 +scene0596_02 +scene0532_00 +scene0532_01 +scene0637_00 +scene0638_00 +scene0121_00 +scene0121_01 +scene0121_02 +scene0040_00 +scene0040_01 +scene0197_00 +scene0197_01 +scene0197_02 +scene0410_00 +scene0410_01 +scene0305_00 +scene0305_01 +scene0615_00 +scene0615_01 +scene0703_00 +scene0703_01 +scene0555_00 +scene0297_00 +scene0297_01 +scene0297_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0023_00 +scene0094_00 +scene0013_00 +scene0013_01 +scene0013_02 +scene0136_00 +scene0136_01 +scene0136_02 +scene0407_00 +scene0407_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0386_00 +scene0318_00 +scene0554_00 +scene0554_01 +scene0497_00 +scene0213_00 +scene0258_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0016_00 +scene0016_01 +scene0016_02 +scene0681_00 +scene0398_00 +scene0398_01 +scene0227_00 +scene0090_00 +scene0066_00 +scene0262_00 +scene0262_01 +scene0155_00 +scene0155_01 +scene0155_02 +scene0352_00 +scene0352_01 +scene0352_02 +scene0038_00 +scene0038_01 +scene0038_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0640_00 +scene0640_01 +scene0640_02 +scene0080_00 +scene0080_01 +scene0080_02 +scene0403_00 +scene0403_01 +scene0282_00 +scene0282_01 +scene0282_02 +scene0682_00 +scene0173_00 +scene0173_01 +scene0173_02 +scene0522_00 +scene0687_00 +scene0345_00 +scene0345_01 +scene0612_00 +scene0612_01 +scene0411_00 +scene0411_01 +scene0411_02 +scene0625_00 +scene0625_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0676_00 +scene0676_01 +scene0179_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0547_00 +scene0547_01 +scene0547_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0366_00 +scene0680_00 +scene0680_01 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0346_00 +scene0346_01 +scene0359_00 +scene0359_01 +scene0014_00 +scene0120_00 
+scene0120_01 +scene0212_00 +scene0212_01 +scene0212_02 +scene0176_00 +scene0049_00 +scene0259_00 +scene0259_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0309_00 +scene0309_01 +scene0125_00 +scene0455_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0326_00 +scene0372_00 +scene0171_00 +scene0171_01 +scene0374_00 +scene0654_00 +scene0654_01 +scene0445_00 +scene0445_01 +scene0475_00 +scene0475_01 +scene0475_02 +scene0349_00 +scene0349_01 +scene0234_00 +scene0669_00 +scene0669_01 +scene0375_00 +scene0375_01 +scene0375_02 +scene0387_00 +scene0387_01 +scene0387_02 +scene0312_00 +scene0312_01 +scene0312_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0000_00 +scene0000_01 +scene0000_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0301_00 +scene0301_01 +scene0301_02 +scene0322_00 +scene0542_00 +scene0079_00 +scene0079_01 +scene0099_00 +scene0099_01 +scene0476_00 +scene0476_01 +scene0476_02 +scene0394_00 +scene0394_01 +scene0147_00 +scene0147_01 +scene0067_00 +scene0067_01 +scene0067_02 +scene0397_00 +scene0397_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0431_00 +scene0223_00 +scene0223_01 +scene0223_02 +scene0010_00 +scene0010_01 +scene0402_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0679_00 +scene0679_01 +scene0405_00 +scene0128_00 +scene0408_00 +scene0408_01 +scene0190_00 +scene0107_00 +scene0076_00 +scene0167_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0216_00 +scene0202_00 +scene0303_00 +scene0303_01 +scene0303_02 +scene0446_00 +scene0446_01 +scene0089_00 +scene0089_01 +scene0089_02 +scene0360_00 +scene0150_00 +scene0150_01 +scene0150_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0454_00 +scene0626_00 +scene0626_01 +scene0626_02 +scene0186_00 +scene0186_01 +scene0538_00 +scene0479_00 +scene0479_01 +scene0479_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0525_00 +scene0525_01 +scene0525_02 +scene0308_00 +scene0396_00 +scene0396_01 +scene0396_02 +scene0624_00 +scene0292_00 +scene0292_01 +scene0632_00 +scene0253_00 +scene0021_00 +scene0325_00 +scene0325_01 +scene0437_00 +scene0437_01 +scene0438_00 +scene0590_00 +scene0590_01 +scene0400_00 +scene0400_01 +scene0541_00 +scene0541_01 +scene0541_02 +scene0677_00 +scene0677_01 +scene0677_02 +scene0443_00 +scene0315_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0422_00 +scene0672_00 +scene0672_01 +scene0184_00 +scene0449_00 +scene0449_01 +scene0449_02 +scene0048_00 +scene0048_01 +scene0138_00 +scene0452_00 +scene0452_01 +scene0452_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0463_00 +scene0463_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0636_00 +scene0457_00 +scene0457_01 +scene0457_02 +scene0465_00 +scene0465_01 +scene0577_00 +scene0151_00 +scene0151_01 +scene0339_00 +scene0573_00 +scene0573_01 +scene0154_00 +scene0096_00 +scene0096_01 +scene0096_02 +scene0235_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0594_00 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0229_00 +scene0229_01 +scene0229_02 +scene0512_00 +scene0106_00 +scene0106_01 +scene0106_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0489_00 +scene0489_01 +scene0489_02 +scene0425_00 +scene0425_01 +scene0641_00 +scene0526_00 +scene0526_01 +scene0317_00 +scene0317_01 +scene0544_00 +scene0017_00 +scene0017_01 +scene0017_02 +scene0042_00 +scene0042_01 +scene0042_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0347_00 +scene0347_01 +scene0347_02 +scene0436_00 +scene0226_00 +scene0226_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 +scene0619_00 +scene0097_00 
+scene0367_00 +scene0367_01 +scene0491_00 +scene0492_00 +scene0492_01 +scene0005_00 +scene0005_01 +scene0543_00 +scene0543_01 +scene0543_02 +scene0657_00 +scene0341_00 +scene0341_01 +scene0534_00 +scene0534_01 +scene0319_00 +scene0273_00 +scene0273_01 +scene0225_00 +scene0198_00 +scene0003_00 +scene0003_01 +scene0003_02 +scene0409_00 +scene0409_01 +scene0331_00 +scene0331_01 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0057_00 +scene0057_01 +scene0074_00 +scene0074_01 +scene0074_02 +scene0091_00 +scene0112_00 +scene0112_01 +scene0112_02 +scene0240_00 +scene0102_00 +scene0102_01 +scene0513_00 +scene0514_00 +scene0514_01 +scene0537_00 +scene0516_00 +scene0516_01 +scene0495_00 +scene0617_00 +scene0133_00 +scene0520_00 +scene0520_01 +scene0635_00 +scene0635_01 +scene0054_00 +scene0473_00 +scene0473_01 +scene0524_00 +scene0524_01 +scene0379_00 +scene0471_00 +scene0471_01 +scene0471_02 +scene0566_00 +scene0248_00 +scene0248_01 +scene0248_02 +scene0529_00 +scene0529_01 +scene0529_02 +scene0391_00 +scene0264_00 +scene0264_01 +scene0264_02 +scene0675_00 +scene0675_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0450_00 +scene0068_00 +scene0068_01 +scene0237_00 +scene0237_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0605_00 +scene0605_01 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0170_00 +scene0170_01 +scene0170_02 +scene0433_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0059_00 +scene0059_01 +scene0059_02 +scene0056_00 +scene0056_01 +scene0478_00 +scene0478_01 +scene0548_00 +scene0548_01 +scene0548_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0033_00 +scene0145_00 +scene0483_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0180_00 +scene0148_00 +scene0556_00 +scene0556_01 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0073_00 +scene0073_01 +scene0073_02 +scene0073_03 +scene0034_00 +scene0034_01 +scene0034_02 +scene0639_00 +scene0561_00 +scene0561_01 +scene0298_00 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 +scene0692_04 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0706_00 +scene0567_00 +scene0567_01 diff --git a/data/scannet/meta_data/scannetv2_val.txt b/data/scannet/meta_data/scannetv2_val.txt new file mode 100644 index 0000000..b9e7d92 --- /dev/null +++ b/data/scannet/meta_data/scannetv2_val.txt @@ -0,0 +1,312 @@ +scene0568_00 +scene0568_01 +scene0568_02 +scene0304_00 +scene0488_00 +scene0488_01 +scene0412_00 +scene0412_01 +scene0217_00 +scene0019_00 +scene0019_01 +scene0414_00 +scene0575_00 +scene0575_01 +scene0575_02 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0549_00 +scene0549_01 +scene0578_00 +scene0578_01 +scene0578_02 +scene0665_00 +scene0665_01 +scene0050_00 +scene0050_01 +scene0050_02 +scene0257_00 +scene0025_00 +scene0025_01 +scene0025_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0580_00 +scene0580_01 +scene0565_00 +scene0169_00 +scene0169_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0063_00 +scene0221_00 +scene0221_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0462_00 +scene0427_00 +scene0595_00 +scene0193_00 +scene0193_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 
+scene0599_01 +scene0599_02 +scene0328_00 +scene0300_00 +scene0300_01 +scene0354_00 +scene0458_00 +scene0458_01 +scene0423_00 +scene0423_01 +scene0423_02 +scene0307_00 +scene0307_01 +scene0307_02 +scene0606_00 +scene0606_01 +scene0606_02 +scene0432_00 +scene0432_01 +scene0608_00 +scene0608_01 +scene0608_02 +scene0651_00 +scene0651_01 +scene0651_02 +scene0430_00 +scene0430_01 +scene0689_00 +scene0357_00 +scene0357_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0329_00 +scene0329_01 +scene0329_02 +scene0153_00 +scene0153_01 +scene0616_00 +scene0616_01 +scene0671_00 +scene0671_01 +scene0618_00 +scene0382_00 +scene0382_01 +scene0490_00 +scene0621_00 +scene0607_00 +scene0607_01 +scene0149_00 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0389_00 +scene0377_00 +scene0377_01 +scene0377_02 +scene0342_00 +scene0139_00 +scene0629_00 +scene0629_01 +scene0629_02 +scene0496_00 +scene0633_00 +scene0633_01 +scene0518_00 +scene0652_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0144_00 +scene0144_01 +scene0494_00 +scene0278_00 +scene0278_01 +scene0316_00 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0084_00 +scene0084_01 +scene0084_02 +scene0696_00 +scene0696_01 +scene0696_02 +scene0351_00 +scene0351_01 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0647_00 +scene0647_01 +scene0535_00 +scene0353_00 +scene0353_01 +scene0353_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0593_00 +scene0593_01 +scene0246_00 +scene0653_00 +scene0653_01 +scene0064_00 +scene0064_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0222_00 +scene0222_01 +scene0338_00 +scene0338_01 +scene0338_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0660_00 +scene0553_00 +scene0553_01 +scene0553_02 +scene0527_00 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0334_00 +scene0334_01 +scene0334_02 +scene0046_00 +scene0046_01 +scene0046_02 +scene0203_00 +scene0203_01 +scene0203_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0086_00 +scene0086_01 +scene0086_02 +scene0670_00 +scene0670_01 +scene0256_00 +scene0256_01 +scene0256_02 +scene0249_00 +scene0441_00 +scene0658_00 +scene0704_00 +scene0704_01 +scene0187_00 +scene0187_01 +scene0131_00 +scene0131_01 +scene0131_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0461_00 +scene0011_00 +scene0011_01 +scene0343_00 +scene0251_00 +scene0077_00 +scene0077_01 +scene0684_00 +scene0684_01 +scene0550_00 +scene0686_00 +scene0686_01 +scene0686_02 +scene0208_00 +scene0500_00 +scene0500_01 +scene0552_00 +scene0552_01 +scene0648_00 +scene0648_01 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0690_00 +scene0690_01 +scene0693_00 +scene0693_01 +scene0693_02 +scene0700_00 +scene0700_01 +scene0700_02 +scene0699_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0355_00 +scene0355_01 +scene0146_00 +scene0146_01 +scene0146_02 +scene0196_00 +scene0702_00 +scene0702_01 +scene0702_02 +scene0314_00 +scene0277_00 +scene0277_01 +scene0277_02 +scene0095_00 +scene0095_01 +scene0015_00 +scene0100_00 +scene0100_01 +scene0100_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0685_00 +scene0685_01 +scene0685_02 diff --git a/data/scannet/scannet_utils.py b/data/scannet/scannet_utils.py new file mode 100644 index 0000000..46e160b --- /dev/null +++ 
b/data/scannet/scannet_utils.py
@@ -0,0 +1,87 @@
+# Modified from
+# https://github.com/facebookresearch/votenet/blob/master/scannet/scannet_utils.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Ref: https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts
+"""
+
+import csv
+import os
+
+import numpy as np
+from plyfile import PlyData
+
+
+def represents_int(s):
+    """Judge whether string s represents an int.
+
+    Args:
+        s (str): The input string to be judged.
+
+    Returns:
+        bool: Whether s represents an int or not.
+    """
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+
+def read_label_mapping(filename,
+                       label_from='raw_category',
+                       label_to='nyu40id'):
+    """Read the label-mapping TSV into a label_from -> label_to dict."""
+    assert os.path.isfile(filename)
+    mapping = dict()
+    with open(filename) as csvfile:
+        reader = csv.DictReader(csvfile, delimiter='\t')
+        for row in reader:
+            mapping[row[label_from]] = int(row[label_to])
+    # If keys are numeric ids rather than category names, cast them to int
+    if represents_int(list(mapping.keys())[0]):
+        mapping = {int(k): v for k, v in mapping.items()}
+    return mapping
+
+
+def read_mesh_vertices(filename):
+    """Read XYZ for each vertex.
+
+    Args:
+        filename (str): The name of the mesh vertices file.
+
+    Returns:
+        ndarray: Vertices.
+    """
+    assert os.path.isfile(filename)
+    with open(filename, 'rb') as f:
+        plydata = PlyData.read(f)
+        num_verts = plydata['vertex'].count
+        vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32)
+        vertices[:, 0] = plydata['vertex'].data['x']
+        vertices[:, 1] = plydata['vertex'].data['y']
+        vertices[:, 2] = plydata['vertex'].data['z']
+    return vertices
+
+
+def read_mesh_vertices_rgb(filename):
+    """Read XYZ and RGB for each vertex.
+
+    Args:
+        filename (str): The name of the mesh vertices file.
+
+    Returns:
+        ndarray: Vertices with colors. Note that RGB values are in 0-255.
+    """
+    assert os.path.isfile(filename)
+    with open(filename, 'rb') as f:
+        plydata = PlyData.read(f)
+        num_verts = plydata['vertex'].count
+        vertices = np.zeros(shape=[num_verts, 6], dtype=np.float32)
+        vertices[:, 0] = plydata['vertex'].data['x']
+        vertices[:, 1] = plydata['vertex'].data['y']
+        vertices[:, 2] = plydata['vertex'].data['z']
+        vertices[:, 3] = plydata['vertex'].data['red']
+        vertices[:, 4] = plydata['vertex'].data['green']
+        vertices[:, 5] = plydata['vertex'].data['blue']
+    return vertices
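+
+
+# A minimal usage sketch (illustrative, not part of the original benchmark
+# scripts). It maps the raw categories of the combined-labels TSV above to
+# NYU40 ids; the TSV path is hypothetical, point it at wherever the label
+# file actually lives in your data folder.
+#
+#   label_map = read_label_mapping(
+#       'data/scannet/meta_data/scannetv2-labels.combined.tsv',
+#       label_from='raw_category', label_to='nyu40id')
+#   label_map['folded chair']  # -> 5, the nyu40 id of 'chair'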
Preprocess data for offline benchmark by running our script: + +``` +python data_prepare.py --panorama-root panorama_folder_unziped --bb-root bb_folder_unziped/Structured3D/ --bins-root bins + +``` +After this step you have the following file structure here: +``` +bins +├── bboxs +│ ├── scene_xxxxx_xx.npy +├── instance_mask +│ ├── scene_xxxxx_xx.bin +├── points +│ ├── scene_xxxxx_xx.bin +├── semantic_mask +│ ├── scene_xxxxx_xx.bin +``` + +3. Generate final pkl data by running: + +``` +python structured3d_data_utils.py --bins-root bins +``` +Overall you achieve the following file structure in `bins` directory: +``` +bins +├── bboxs +│ ├── scene_xxxxx_xx.npy +├── instance_mask +│ ├── scene_xxxxx_xx.bin +├── points +│ ├── scene_xxxxx_xx.bin +├── semantic_mask +│ ├── scene_xxxxx_xx.bin +├── structured3d_infos_train.pkl +├── structured3d_infos_val.pkl +├── structured3d_infos_test.pkl +``` + diff --git a/data/structured3d/data_prepare.py b/data/structured3d/data_prepare.py new file mode 100644 index 0000000..22d50c3 --- /dev/null +++ b/data/structured3d/data_prepare.py @@ -0,0 +1,73 @@ +import argparse +import os +import numpy as np +from utils import Structured3DScene + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--panorama-root', + required=True, + help='Folder with panorama scenes', + type=str) + parser.add_argument( + '--bb-root', + required=True, + help='Folder with bb scenes', + type=str) + parser.add_argument( + '--bins-root', + required=True, + help='Folder with bin files', + type=str) + + args = parser.parse_args() + if not os.path.exists(args.bins_root): + os.mkdir(args.bins_root) + + inst_bins_path = os.path.join(args.bins_root, 'instance_mask') + pc_bins_path = os.path.join(args.bins_root, 'points') + sem_bins_path = os.path.join(args.bins_root, 'semantic_mask') + bb_bins_path = os.path.join(args.bins_root, 'bboxs') + + if not os.path.exists(inst_bins_path): + os.mkdir(inst_bins_path) + + if not os.path.exists(pc_bins_path): + os.mkdir(pc_bins_path) + + if not os.path.exists(sem_bins_path): + os.mkdir(sem_bins_path) + + if not os.path.exists(bb_bins_path): + os.mkdir(bb_bins_path) + + sorted_scene_folder = sorted( + os.listdir(args.panorama_root), key=lambda x: int(x)) + + for scenes in sorted_scene_folder: + path_to_scenes = os.path.join( + args.panorama_root, scenes, 'Structured3D') + scenes = sorted(os.listdir(path_to_scenes), key=lambda x: int(x[-5:])) + for scene in scenes: + scene_id = int(scene[-5:]) + data = Structured3DScene( + path_to_scenes, args.bb_root, 'full', scene_id) + room_nums = len(data.point_cloud['point_clouds']) + for idx in range(room_nums): + data.point_cloud['point_clouds'][idx].astype( + np.float32).tofile( + os.path.join(pc_bins_path, f'{scene}_{idx}.bin')) + + data.point_cloud['labels'][idx].astype(np.int64).tofile( + os.path.join(sem_bins_path, f'{scene}_{idx}.bin')) + + data.point_cloud['instances'][idx].astype(np.int64).tofile( + os.path.join(inst_bins_path, f'{scene}_{idx}.bin')) + + np.save( + os.path.join(bb_bins_path, f'{scene}_{idx}.npy'), + data.point_cloud['bboxs'][idx]) + + print(f'{scene} is processed') diff --git a/data/structured3d/structured3d_data_utils.py b/data/structured3d/structured3d_data_utils.py new file mode 100644 index 0000000..f0c593c --- /dev/null +++ b/data/structured3d/structured3d_data_utils.py @@ -0,0 +1,265 @@ +import os +import mmengine +import numpy as np +import argparse + + +class Structured3DData: + """Structured3DData. 
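As a quick sanity check on the `.bin` dumps written by `data_prepare.py` above, the files can be read back with the same dtypes they were written with (`float32` for the 6-channel points, `int64` for both masks). A minimal sketch; the room name is a hypothetical example:

```python
import numpy as np

# Hypothetical room id; substitute any file produced by data_prepare.py.
points = np.fromfile('bins/points/scene_00000_0.bin',
                     dtype=np.float32).reshape(-1, 6)  # xyz + rgb
sem = np.fromfile('bins/semantic_mask/scene_00000_0.bin', dtype=np.int64)
inst = np.fromfile('bins/instance_mask/scene_00000_0.bin', dtype=np.int64)
bboxes = np.load('bins/bboxs/scene_00000_0.npy')  # rows: center, half extents, label
assert len(points) == len(sem) == len(inst)
```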
+ + Args: + bins_path (str): Root where all bins files are stored. + point_folder (str): Folder where point bins are stored. + Defaults to 'points'. + inst_folder (str): Folder where instance_mask bins are stored. + Defaults to 'instance_mask'. + sem_folder (str): Folder where semantic_mask bins are stored. + Defaults to 'semantic_mask'. + train_scene_end (str): The last train scene . + Defaults to 'scene_03000'. + val_scene_end (str): The last val scene. + Defaults to 'scene_03250'. + is_test_needed (bool): Whether or not create test dataset. + Defaults to True. + """ + + def __init__(self, + bins_path, + point_folder='points', + inst_folder='instance_mask', + sem_folder='semantic_mask', + bboxs_folder='bboxs', + train_scene_end='scene_03000', + val_scene_end='scene_03250', + is_test_needed=True): + assert os.path.exists(bins_path) + points_path = os.path.join(bins_path, point_folder) + inst_path = os.path.join(bins_path, inst_folder) + sem_path = os.path.join(bins_path, sem_folder) + self.bb_path = os.path.join(bins_path, bboxs_folder) + assert os.path.exists( + points_path), f'Path to point bins: {points_path} does not exist' + assert os.path.exists( + inst_path), f'Path to instance bins: {inst_path} does not exist' + assert os.path.exists( + sem_path), f'Path to semantic bins: {sem_path} does not exist' + assert os.path.exists( + self.bb_path), f'Path to bboxs npy: {self.bb_path} does not exist' + self.classes = [ + 'unknown', 'wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', + 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', + 'blinds', 'desk', 'shelves', 'curtain', 'dresser', 'pillow', + 'mirror', 'floor mat', 'clothes', 'ceiling', 'books', 'fridge', + 'television', 'paper', 'towel', 'shower curtain', 'box', + 'whiteboard', 'person', 'night stand', 'toilet', 'sink', 'lamp', + 'bathtub', 'bag', 'structure', 'furniture', 'prop' + ] + + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = {self.cat2label[t]: t for t in self.cat2label} + self.points = np.array( + sorted( + os.listdir(points_path), key=lambda x: int(x.split('_')[1]))) + self.insts = np.array( + sorted(os.listdir(inst_path), key=lambda x: int(x.split('_')[1]))) + self.sems = np.array( + sorted(os.listdir(sem_path), key=lambda x: int(x.split('_')[1]))) + + if is_test_needed: + self.train_dataset_points, self.val_dataset_points, \ + self.test_dataset_points = self.get_dataset( + self.points, train_scene_end, val_scene_end, + is_test_needed) + self.train_dataset_insts, self.val_dataset_insts, \ + self.test_dataset_insts = self.get_dataset( + self.insts, train_scene_end, val_scene_end, is_test_needed) + self.train_dataset_sems, self.val_dataset_sems, \ + self.test_dataset_sems = self.get_dataset( + self.sems, train_scene_end, val_scene_end, is_test_needed) + self.test_dataset = np.hstack([ + self.test_dataset_points.reshape(-1, 1), + self.test_dataset_sems.reshape(-1, 1), + self.test_dataset_insts.reshape(-1, 1) + ]) + else: + self.train_dataset_points, self.val_dataset_points = \ + self.get_dataset( + self.points, train_scene_end, val_scene_end, is_test_needed) + self.train_dataset_insts, self.val_dataset_insts = \ + self.get_dataset( + self.insts, train_scene_end, val_scene_end, is_test_needed) + self.train_dataset_sems, self.val_dataset_sems = \ + self.get_dataset( + self.sems, train_scene_end, val_scene_end, is_test_needed) + + self.train_dataset = np.hstack([ + self.train_dataset_points.reshape(-1, 1), + self.train_dataset_sems.reshape(-1, 1), + 
self.train_dataset_insts.reshape(-1, 1) + ]) + self.val_dataset = np.hstack([ + self.val_dataset_points.reshape(-1, 1), + self.val_dataset_sems.reshape(-1, 1), + self.val_dataset_insts.reshape(-1, 1) + ]) + self.datasets = {'train': self.train_dataset, 'val': self.val_dataset} + + if is_test_needed: + self.datasets['test'] = self.test_dataset + + def __len__(self): + return len(self.points) + + def get_idx(self, path, train_scene_end, val_scene_end): + """Get indexes. + This method gets indexes for train and val datasets. + + Args: + path (str): Path to the folder with bins. + train_scene_end (str): The last train scene. + val_scene_end (str): The last val scene. + + Returns: + int: Train index + int: Val index + + """ + train_flag = True + val_flag = True + for idx, f in enumerate(path): + if f.startswith(train_scene_end) and train_flag: + train_idx = idx + train_flag = False + + if f.startswith(val_scene_end) and val_flag: + val_idx = idx + val_flag = False + + return train_idx, val_idx + + def get_dataset(self, + path, + train_scene_end, + val_scene_end, + is_test_needed=True): + """Get datasets + This method gets train, validation and test if needed datasets + + Args: + path (str): Path to the folder with bins + train_scene_end (str): The last train scene + val_scene_end (str): The last val scene + is_test_needed (bool): Whether or not create test dataset + Defaults to True + + Returns: + np.ndarray: Train dataset + np.ndarray: Validtion dataset + np.ndarray or None: Test dataset + """ + train_idx, val_idx = self.get_idx(path, train_scene_end, val_scene_end) + train_dataset = path[:train_idx] + if is_test_needed: + val_dataset = path[train_idx:val_idx] + test_dataset = path[val_idx:] + return np.array(train_dataset), np.array(val_dataset), \ + np.array(test_dataset) + + else: + val_dataset = path[train_idx:] + return np.array(train_dataset), np.array(val_dataset) + + def get_instances(self, sample_idx): + """Get instances + This method gets instances for the room + + Args: + sample_idx (str): Sample_idx of the room + + Returns: + List[dict]: Instances for the room + """ + instances = [] + path = os.path.join(self.bb_path, f'{sample_idx}.npy') + raw_bboxs = np.load(path) + for i in raw_bboxs: + bbox = i[:-1].copy() + if bbox[3] == 0 or bbox[4] == 0 or bbox[5] == 0: + continue + bbox[3:] = bbox[3:] * 2 + instances.append({ + 'bbox_3d': (bbox).tolist(), + 'bbox_label_3d': int(i[-1]) + }) + + return instances + + def get_data_list(self, split='train'): + """Get data list. + This method gets data list for the dataset. + + Args: + split (str): 'train', 'val' or 'test'. Defaults to 'train'. + + Returns: + List[dict]: Data list for the dataset. + """ + data_list = [] + dataset = self.datasets[split] + for f in dataset: + data_list.append({ + 'lidar_points': { + 'num_pts_feats': 6, + 'lidar_path': f[0] + }, + 'instances': self.get_instances(f[0].split('.')[0]), + 'pts_semantic_mask_path': f[1], + 'pts_instance_mask_path': f[2], + 'axis_align_matrix': np.eye(4) + }) + return data_list + + def get_anno(self, split='train'): + """Get data list. + This method gets annotations for the dataset. + + Args: + split (str): 'train', 'val' or 'test'. Defaults to 'train'. + + Returns: + dict: Annotations for the dataset. 
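Once the pkl files are generated, they can be inspected directly with `mmengine.load`; a minimal sketch, assuming the default output location inside the bins folder:

```python
import mmengine

info = mmengine.load('bins/structured3d_infos_train.pkl')
print(info['metainfo']['dataset'])  # 'Structured3D'
print(len(info['data_list']))       # number of rooms in the split
sample = info['data_list'][0]
print(sample['lidar_points']['lidar_path'], len(sample['instances']))
```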
+ """ + anno = { + 'metainfo': { + 'categories': self.cat2label, + 'dataset': 'Structured3D', + 'info_version': '1.0' + } + } + + anno['data_list'] = self.get_data_list(split) + return anno + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--bins-root', + required=True, + help='Enter here the path to the bins folder', + type=str) + args = parser.parse_args() + pkl_prefix = 'structured3d' + dataset = Structured3DData(args.bins_root) + train_anno = dataset.get_anno(split='train') + val_anno = dataset.get_anno(split='val') + test_anno = dataset.get_anno(split='test') + filename_train = os.path.join( + args.bins_root, f'{pkl_prefix}_infos_train.pkl') + filename_val = os.path.join( + args.bins_root, f'{pkl_prefix}_infos_val.pkl') + filename_test = os.path.join( + args.bins_root, f'{pkl_prefix}_infos_test.pkl') + mmengine.dump(train_anno, filename_train, 'pkl') + mmengine.dump(val_anno, filename_val, 'pkl') + mmengine.dump(test_anno, filename_test, 'pkl') diff --git a/data/structured3d/unzip.py b/data/structured3d/unzip.py new file mode 100644 index 0000000..ce41c71 --- /dev/null +++ b/data/structured3d/unzip.py @@ -0,0 +1,57 @@ +import argparse +import os +import zipfile + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--panorama-root', + required=True, + help='Folder with panorama archives', + type=str) + parser.add_argument( + '--output-panorama-root', + required=True, + help='Folder with unziped panoramas', + type=str) + parser.add_argument( + '--bb-root', + required=True, + help='Folder with 3d bounding boxes and annotations', + type=str) + parser.add_argument( + '--output-bb-root', + required=True, + help='Folder for unziped 3d bounding boxes and annotations', + type=str) + + args = parser.parse_args() + if not os.path.exists(args.output_panorama_root): + os.mkdir(args.output_panorama_root) + + if not os.path.exists(args.output_bb_root): + os.mkdir(args.output_bb_root) + + sorted_panorama_root = sorted( + os.listdir(args.panorama_root), + key=lambda x: int(x[:-4].split('_')[-1])) + for idx, zip in enumerate(sorted_panorama_root): + print(idx, zip) + out_path = os.path.join(args.output_panorama_root, str(idx)) + with zipfile.ZipFile( + os.path.join(args.panorama_root, zip), 'r') as z: + for name in z.namelist(): + try: + z.extract(name, out_path) + except Exception as e: + print(e) + + print(f'File - {zip} is successfully unziped to the {out_path} folder') + + with zipfile.ZipFile(os.path.join( + args.bb_root, os.listdir(args.bb_root)[0])) as z: + z.extractall(args.output_bb_root) + print( + f'File - {os.listdir(args.bb_root)[0]} is successfully ' + f'unziped to the {args.output_bb_root} folder') diff --git a/data/structured3d/utils.py b/data/structured3d/utils.py new file mode 100644 index 0000000..22a80f6 --- /dev/null +++ b/data/structured3d/utils.py @@ -0,0 +1,248 @@ +import os +import cv2 +import numpy as np + + +COLOR_TO_LABEL = { + (0, 0, 0): 'unknown', + (174, 199, 232): 'wall', + (152, 223, 138): 'floor', + (31, 119, 180): 'cabinet', + (255, 187, 120): 'bed', + (188, 189, 34): 'chair', + (140, 86, 75): 'sofa', + (255, 152, 150): 'table', + (214, 39, 40): 'door', + (197, 176, 213): 'window', + (148, 103, 189): 'bookshelf', + (196, 156, 148): 'picture', + (23, 190, 207): 'counter', + (178, 76, 76): 'blinds', + (247, 182, 210): 'desk', + (66, 188, 102): 'shelves', + (219, 219, 141): 'curtain', + (140, 57, 197): 'dresser', + (202, 185, 52): 'pillow', + (51, 176, 203): 'mirror', + (200, 54, 131): 
'floor mat', + (92, 193, 61): 'clothes', + (78, 71, 183): 'ceiling', + (172, 114, 82): 'books', + (255, 127, 14): 'fridge', + (91, 163, 138): 'television', + (153, 98, 156): 'paper', + (140, 153, 101): 'towel', + (158, 218, 229): 'shower curtain', + (100, 125, 154): 'box', + (178, 127, 135): 'whiteboard', + (120, 185, 128): 'person', + (146, 111, 194): 'night stand', + (44, 160, 44): 'toilet', + (112, 128, 144): 'sink', + (96, 207, 209): 'lamp', + (227, 119, 194): 'bathtub', + (213, 92, 176): 'bag', + (94, 106, 211): 'structure', + (82, 84, 163): 'furniture', + (100, 85, 144): 'prop' +} + +colors_and_ids = {k: i for i, (k, s) in enumerate(COLOR_TO_LABEL.items())} +rgbs = np.array(list(colors_and_ids.keys())) +ids = np.array(list(colors_and_ids.values())) +mapping = np.zeros(shape=(256, 256, 256)) +mapping[rgbs[:, 0], rgbs[:, 1], rgbs[:, 2]] = ids + + +class Structured3DScene(): + """Structured3DScene + + Args: + path_to_scenes (str): Root path to the unziped scenes. + path_to_bb (str): Root to the unziped bounding boxes + and annotations data. + resolution (str): The resolution of the images. + scene_id (int): Scene index. + """ + + def __init__(self, path_to_scenes, path_to_bb, resolution, scene_id): + self.resolution = resolution + self.path_to_bb = path_to_bb + path = path_to_scenes + scene_id = f'{scene_id:05d}' + self.scene_id = scene_id + self.scene_path = os.path.join( + path, f'scene_{scene_id}', '2D_rendering') + room_ids = [p for p in os.listdir(self.scene_path)] + self.depth_paths = [ + os.path.join(*[ + self.scene_path, room_id, 'panorama', self.resolution, + 'depth.png' + ]) for room_id in room_ids + ] + + self.camera_paths = [ + os.path.join( + *[self.scene_path, room_id, 'panorama', 'camera_xyz.txt']) + for room_id in room_ids + ] + + self.rgb_paths = [ + os.path.join(*[ + self.scene_path, room_id, 'panorama', self.resolution, + 'rgb_coldlight.png' + ]) for room_id in room_ids + ] + + self.seman_paths = [ + os.path.join(*[ + self.scene_path, room_id, 'panorama', self.resolution, + 'semantic.png' + ]) for room_id in room_ids + ] + + self.inst_paths = [ + os.path.join(*[ + self.path_to_bb, f'scene_{self.scene_id}', '2D_rendering', + room_id, f'panorama/{self.resolution}', 'instance.png' + ]) for room_id in room_ids + ] + + self.camera_centers = self.read_camera_center() + self.point_cloud = self.generate_point_cloud() + + def read_camera_center(self): + """Read the camera centers. + This method gets information about camera centers. + + Returns: + List[np.ndarray]: camera centers for every room in the scene. + """ + camera_centers = [] + for i in range(len(self.camera_paths)): + if os.path.exists(self.camera_paths[i]): + with open(self.camera_paths[i], 'r') as f: + line = f.readline() + center = list(map(float, line.strip().split(' '))) + camera_centers.append( + np.asarray([center[0], center[1], center[2]])) + + return camera_centers + + def generate_point_cloud(self): + """Generate data. + This method gets point_clouds, semantics, instances + and bboxs for every room in the scene. + + Returns: + dict: Processed point_clouds, semantics, instances, bboxs. 
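The depth-to-point conversion implemented in the method body below is a standard equirectangular unprojection; the same math, pulled out as a standalone sketch:

```python
import numpy as np

def panorama_depth_to_xyz(depth, camera_center):
    """Spherical unprojection mirroring generate_point_cloud below.

    depth: (H, W) panorama depth in millimeters.
    camera_center: (3,) camera position in millimeters.
    """
    H, W = depth.shape
    x_tick, y_tick = 180.0 / H, 360.0 / W
    x = np.broadcast_to(np.arange(H).reshape(-1, 1), (H, W))
    y = np.broadcast_to(np.arange(W), (H, W))
    alpha = np.deg2rad(90 - x * x_tick)  # elevation angle of each pixel row
    beta = np.deg2rad(y * y_tick - 180)  # azimuth angle of each pixel column
    xy = depth * np.cos(alpha)           # range projected onto the XY plane
    xyz = np.stack([xy * np.sin(beta) + camera_center[0],
                    xy * np.cos(beta) + camera_center[1],
                    depth * np.sin(alpha) + camera_center[2]], axis=-1)
    return xyz.reshape(-1, 3) / 1000     # millimeters -> meters
```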
+ """ + points = {} + labels = [] + point_clouds = [] + insts = [] + bboxs = [] + for i in range(len(self.depth_paths)): + try: + depth = cv2.imread(self.depth_paths[i], cv2.IMREAD_ANYDEPTH) + # ------------------- # + H, W = depth.shape + x_tick = 180.0 / H + y_tick = 360.0 / W + x = np.arange(H) + y = np.arange(W) + x = np.broadcast_to(x.reshape(-1, 1), (H, W)) + y = np.broadcast_to(y.reshape(-1), (H, W)) + alpha = 90 - (x * x_tick) + beta = y * y_tick - 180 + xy_offset = depth * np.cos(np.deg2rad(alpha)) + x_offset = xy_offset * np.sin( + np.deg2rad(beta)) + self.camera_centers[i][0] + y_offset = xy_offset * np.cos( + np.deg2rad(beta)) + self.camera_centers[i][1] + z_offset = depth * np.sin( + np.deg2rad(alpha)) + self.camera_centers[i][2] + temp = np.hstack([ + x_offset.reshape(-1, 1), + y_offset.reshape(-1, 1), + z_offset.reshape(-1, 1) + ]) / 1000 + # ------------------- # + # Read RGB image + rgb_img = cv2.imread(self.rgb_paths[i]).reshape(-1, 3) + + # ------------------- # + # Read semantic image + semantic = cv2.imread(self.seman_paths[i]) + semantic = cv2.cvtColor(semantic, cv2.COLOR_BGR2RGB) + # ------------------- # + # Read instance image + inst = cv2.imread(self.inst_paths[i], cv2.IMREAD_UNCHANGED) + except: + continue + + semantic = semantic.reshape(-1, 3) + cur_labels = mapping[ + semantic[:, 0], semantic[:, 1], semantic[:, 2]].copy() + inst = inst.reshape(-1) + inst = np.where(inst == 65535, -1, inst) + + if np.unique(inst)[0] == -1: + instance_unique = np.unique(inst)[1:] + else: + instance_unique = np.unique(inst) + if temp.shape[0] != inst.shape[0]: + print( + f'Error - point_cloud shape {temp.shape[0]} ' + f'!= inst.shape {inst.shape[0]}') + continue + + for inst_id in instance_unique: + cur_labels[inst == inst_id] = \ + np.unique(cur_labels[inst == inst_id])[0] + + inst[cur_labels == 1] = -1 + inst[cur_labels == 2] = -1 + + if np.unique(inst)[0] == -1: + instance_unique = np.unique(inst)[1:] + else: + instance_unique = np.unique(inst) + + for inst_id in instance_unique: + assert len(np.unique(cur_labels[inst == inst_id])) == 1 + + if len(inst[cur_labels == 1]) != 0: + assert len(np.unique(inst[cur_labels == 1])) == 1 + assert np.unique(inst[cur_labels == 1])[0] == -1 + + if len(inst[cur_labels == 2]) != 0: + assert len(np.unique(inst[cur_labels == 2])) == 1 + assert np.unique(inst[cur_labels == 2])[0] == -1 + + temp_bb = [] + for inst_id in instance_unique: + indexes = inst == inst_id + current_points = temp[indexes] + current_points_min = current_points.min(0) + current_points_max = current_points.max(0) + current_points_avg = ( + current_points_max + current_points_min) / 2 + lwh = (current_points_max - current_points_avg).copy() + vals, occurs = np.unique( + cur_labels[indexes], return_counts=True) + bbox_labels = vals[occurs.argmax()].copy() + temp_bb.append( + np.hstack([current_points_avg, lwh, bbox_labels])) + + insts.append(inst.copy()) + labels.append(cur_labels) + point_clouds.append(np.hstack([temp, rgb_img]).copy()) + bboxs.append(temp_bb) + + points['labels'] = labels + points['point_clouds'] = point_clouds + points['instances'] = insts + points['bboxs'] = bboxs + + return points diff --git a/oneformer3d/__init__.py b/oneformer3d/__init__.py new file mode 100644 index 0000000..a823481 --- /dev/null +++ b/oneformer3d/__init__.py @@ -0,0 +1,23 @@ +from .oneformer3d import ( + ScanNetOneFormer3D, ScanNet200OneFormer3D, S3DISOneFormer3D, + InstanceOnlyOneFormer3D) +from .spconv_unet import SpConvUNet +from .mink_unet import Res16UNet34C +from .query_decoder 
import ScanNetQueryDecoder, QueryDecoder
+from .unified_criterion import (
+    ScanNetUnifiedCriterion, S3DISUnifiedCriterion)
+from .semantic_criterion import (
+    ScanNetSemanticCriterion, S3DISSemanticCriterion)
+from .instance_criterion import (
+    InstanceCriterion, QueryClassificationCost, MaskBCECost, MaskDiceCost,
+    HungarianMatcher, SparseMatcher, OneDataCriterion)
+from .loading import LoadAnnotations3D_, NormalizePointsColor_
+from .formatting import Pack3DDetInputs_
+from .transforms_3d import (
+    ElasticTransfrom, AddSuperPointAnnotations, SwapChairAndFloor, PointSample_)
+from .data_preprocessor import Det3DDataPreprocessor_
+from .unified_metric import UnifiedSegMetric
+from .scannet_dataset import ScanNetSegDataset_, ScanNet200SegDataset_
+from .s3dis_dataset import S3DISSegDataset_
+from .structured3d_dataset import Structured3DSegDataset, ConcatDataset_
+from .structures import InstanceData_
diff --git a/oneformer3d/data_preprocessor.py b/oneformer3d/data_preprocessor.py
new file mode 100644
index 0000000..a8195b0
--- /dev/null
+++ b/oneformer3d/data_preprocessor.py
@@ -0,0 +1,78 @@
+# Copied from mmdet3d/models/data_preprocessors/data_preprocessor.py
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class Det3DDataPreprocessor_(Det3DDataPreprocessor):
+    """
+    We add only these 2 lines:
+        if 'elastic_coords' in inputs:
+            batch_inputs['elastic_coords'] = inputs['elastic_coords']
+    """
+    def simple_process(self, data, training=False):
+        """Perform normalization, padding and bgr2rgb conversion for img data
+        based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
+        is set to be True.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+                Defaults to False.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        if 'img' in data['inputs']:
+            batch_pad_shape = self._get_pad_shape(data)
+
+        data = self.collate_data(data)
+        inputs, data_samples = data['inputs'], data['data_samples']
+        batch_inputs = dict()
+
+        if 'points' in inputs:
+            batch_inputs['points'] = inputs['points']
+
+            if self.voxel:
+                voxel_dict = self.voxelize(inputs['points'], data_samples)
+                batch_inputs['voxels'] = voxel_dict
+
+        if 'elastic_coords' in inputs:
+            batch_inputs['elastic_coords'] = inputs['elastic_coords']
+
+        if 'imgs' in inputs:
+            imgs = inputs['imgs']
+
+            if data_samples is not None:
+                # NOTE the batched image size information may be useful, e.g.
+                # in DETR, this is needed for the construction of masks, which
+                # is then used for the transformer_head.
+ batch_input_shape = tuple(imgs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, + batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if hasattr(self, 'boxtype2tensor') and self.boxtype2tensor: + from mmdet.models.utils.misc import \ + samplelist_boxtype2tensor + samplelist_boxtype2tensor(data_samples) + elif hasattr(self, 'boxlist2tensor') and self.boxlist2tensor: + from mmdet.models.utils.misc import \ + samplelist_boxlist2tensor + samplelist_boxlist2tensor(data_samples) + if self.pad_mask: + self.pad_gt_masks(data_samples) + + if self.pad_seg: + self.pad_gt_sem_seg(data_samples) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + imgs, data_samples = batch_aug(imgs, data_samples) + batch_inputs['imgs'] = imgs + + return {'inputs': batch_inputs, 'data_samples': data_samples} diff --git a/oneformer3d/evaluate_semantic_instance.py b/oneformer3d/evaluate_semantic_instance.py new file mode 100644 index 0000000..4ad81e1 --- /dev/null +++ b/oneformer3d/evaluate_semantic_instance.py @@ -0,0 +1,368 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/3d_evaluation/evaluate_semantic_instance.py # noqa +from copy import deepcopy + +import numpy as np + +from mmdet3d.evaluation.functional.scannet_utils import util_3d + + +def evaluate_matches(matches, class_labels, options): + """Evaluate instance segmentation from matched gt and predicted instances + for all scenes. + + Args: + matches (dict): Contains gt2pred and pred2gt infos for every scene. + class_labels (tuple[str]): Class names. + options (dict): ScanNet evaluator options. See get_options. + + Returns: + np.array: Average precision scores for all thresholds and categories. 
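For reference, the `options['overlaps']` grid used throughout the evaluation is built by `get_options` at the end of this file:

```python
import numpy as np

# Default IoU thresholds from get_options(): 0.50-0.90 in 0.05 steps, plus 0.25.
overlaps = np.append(np.arange(0.5, 0.95, 0.05), 0.25)
print(overlaps)  # 0.5, 0.55, ..., 0.9 and finally 0.25
# AP is evaluated at every threshold; the headline mAP averages all but 0.25.
```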
+ """ + overlaps = options['overlaps'] + min_region_sizes = [options['min_region_sizes'][0]] + dist_threshes = [options['distance_threshes'][0]] + dist_confs = [options['distance_confs'][0]] + + # results: class x overlap + ap = np.zeros((len(dist_threshes), len(class_labels), len(overlaps)), + float) + pr_rc = np.zeros((2, len(class_labels), len(overlaps)), + float) + for di, (min_region_size, distance_thresh, distance_conf) in enumerate( + zip(min_region_sizes, dist_threshes, dist_confs)): + for oi, overlap_th in enumerate(overlaps): + pred_visited = {} + for m in matches: + for label_name in class_labels: + for p in matches[m]['pred'][label_name]: + if 'filename' in p: + pred_visited[p['filename']] = False + for li, label_name in enumerate(class_labels): + y_true = np.empty(0) + y_score = np.empty(0) + hard_false_negatives = 0 + has_gt = False + has_pred = False + for m in matches: + pred_instances = matches[m]['pred'][label_name] + gt_instances = matches[m]['gt'][label_name] + # filter groups in ground truth + gt_instances = [ + gt for gt in gt_instances + if gt['vert_count'] >= + min_region_size and gt['med_dist'] <= distance_thresh + and gt['dist_conf'] >= distance_conf + ] + if gt_instances: + has_gt = True + if pred_instances: + has_pred = True + + cur_true = np.ones(len(gt_instances)) + cur_score = np.ones(len(gt_instances)) * (-float('inf')) + cur_match = np.zeros(len(gt_instances), dtype=bool) + # collect matches + for (gti, gt) in enumerate(gt_instances): + found_match = False + for pred in gt['matched_pred']: + # greedy assignments + if pred_visited[pred['filename']]: + continue + overlap = float(pred['intersection']) / ( + gt['vert_count'] + pred['vert_count'] - + pred['intersection']) + if overlap > overlap_th: + confidence = pred['confidence'] + # if already have a prediction for this gt, + # the prediction with the lower score is automatically a false positive # noqa + if cur_match[gti]: + max_score = max(cur_score[gti], confidence) + min_score = min(cur_score[gti], confidence) + cur_score[gti] = max_score + # append false positive + cur_true = np.append(cur_true, 0) + cur_score = np.append(cur_score, min_score) + cur_match = np.append(cur_match, True) + # otherwise set score + else: + found_match = True + cur_match[gti] = True + cur_score[gti] = confidence + pred_visited[pred['filename']] = True + if not found_match: + hard_false_negatives += 1 + # remove non-matched ground truth instances + cur_true = cur_true[cur_match] + cur_score = cur_score[cur_match] + + # collect non-matched predictions as false positive + for pred in pred_instances: + found_gt = False + for gt in pred['matched_gt']: + overlap = float(gt['intersection']) / ( + gt['vert_count'] + pred['vert_count'] - + gt['intersection']) + if overlap > overlap_th: + found_gt = True + break + if not found_gt: + num_ignore = pred['void_intersection'] + for gt in pred['matched_gt']: + # group? 
+ if gt['instance_id'] < 1000: + num_ignore += gt['intersection'] + # small ground truth instances + if gt['vert_count'] < min_region_size or gt[ + 'med_dist'] > distance_thresh or gt[ + 'dist_conf'] < distance_conf: + num_ignore += gt['intersection'] + proportion_ignore = float( + num_ignore) / pred['vert_count'] + # if not ignored append false positive + if proportion_ignore <= overlap_th: + cur_true = np.append(cur_true, 0) + confidence = pred['confidence'] + cur_score = np.append(cur_score, confidence) + + # append to overall results + y_true = np.append(y_true, cur_true) + y_score = np.append(y_score, cur_score) + + # compute average precision + if has_gt and has_pred: + # compute precision recall curve first + + # sorting and cumsum + score_arg_sort = np.argsort(y_score) + y_score_sorted = y_score[score_arg_sort] + y_true_sorted = y_true[score_arg_sort] + y_true_sorted_cumsum = np.cumsum(y_true_sorted) + + # unique thresholds + (thresholds, unique_indices) = np.unique( + y_score_sorted, return_index=True) + num_prec_recall = len(unique_indices) + 1 + + # prepare precision recall + num_examples = len(y_score_sorted) + # follow https://github.com/ScanNet/ScanNet/pull/26 ? # noqa + num_true_examples = y_true_sorted_cumsum[-1] if len( + y_true_sorted_cumsum) > 0 else 0 + precision = np.zeros(num_prec_recall) + recall = np.zeros(num_prec_recall) + + # deal with the first point + y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0) + # deal with remaining + for idx_res, idx_scores in enumerate(unique_indices): + cumsum = y_true_sorted_cumsum[idx_scores - 1] + tp = num_true_examples - cumsum + fp = num_examples - idx_scores - tp + fn = cumsum + hard_false_negatives + p = float(tp) / (tp + fp) + r = float(tp) / (tp + fn) + precision[idx_res] = p + recall[idx_res] = r + + # first point in curve is artificial + precision[-1] = 1. + recall[-1] = 0. + + #compute optimal precision and recall, based on f1_score + f1_score = 2 * precision * recall / (precision + recall + 0.0001) + f1_argmax = f1_score.argmax() + best_pr = precision[f1_argmax] + best_rc = recall[f1_argmax] + + # compute average of precision-recall curve + recall_for_conv = np.copy(recall) + recall_for_conv = np.append(recall_for_conv[0], + recall_for_conv) + recall_for_conv = np.append(recall_for_conv, 0.) + + stepWidths = np.convolve(recall_for_conv, [-0.5, 0, 0.5], + 'valid') + # integrate is now simply a dot product + ap_current = np.dot(precision, stepWidths) + + elif has_gt: + ap_current = 0.0 + best_pr = 0 + best_rc = 0 + else: + ap_current = float('nan') + best_pr = float('nan') + best_rc = float('nan') + ap[di, li, oi] = ap_current + pr_rc[0, li, oi] = best_pr + pr_rc[1, li, oi] = best_rc + + return ap, pr_rc + + +def compute_averages(aps, pr_rc, options, class_labels): + """Averages AP scores for all categories. + + Args: + aps (np.array): AP scores for all thresholds and categories. + options (dict): ScanNet evaluator options. See get_options. + class_labels (tuple[str]): Class names. + + Returns: + dict: Overall and per-category AP scores. 
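The curve integration above (`stepWidths` via `np.convolve`) reduces to a dot product over trapezoid widths; a tiny numeric sketch with made-up precision/recall points (recall descending, ending at the artificial p=1, r=0 point):

```python
import numpy as np

precision = np.array([0.5, 0.7, 0.9, 1.0])  # last point is the artificial one
recall = np.array([0.9, 0.6, 0.3, 0.0])

r = np.append(np.append(recall[0], recall), 0.0)       # pad both ends
step_widths = np.convolve(r, [-0.5, 0, 0.5], 'valid')  # 0.5 * (r[i-1] - r[i+1])
ap = np.dot(precision, step_widths)
print(round(ap, 3))  # 0.705
```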
+ """ + d_inf = 0 + o50 = np.where(np.isclose(options['overlaps'], 0.5)) + o25 = np.where(np.isclose(options['overlaps'], 0.25)) + o_all_but25 = np.where( + np.logical_not(np.isclose(options['overlaps'], 0.25))) + avg_dict = {} + avg_dict['all_ap'] = np.nanmean(aps[d_inf, :, o_all_but25]) + avg_dict['all_ap_50%'] = np.nanmean(aps[d_inf, :, o50]) + avg_dict['all_ap_25%'] = np.nanmean(aps[d_inf, :, o25]) + avg_dict['all_prec_50%'] = np.nanmean(pr_rc[0, :, o50]) + avg_dict['all_rec_50%'] = np.nanmean(pr_rc[1, :, o50]) + avg_dict['classes'] = {} + for (li, label_name) in enumerate(class_labels): + avg_dict['classes'][label_name] = {} + avg_dict['classes'][label_name]['ap'] = np.average(aps[d_inf, li, + o_all_but25]) + avg_dict['classes'][label_name]['ap50%'] = np.average(aps[d_inf, li, + o50]) + avg_dict['classes'][label_name]['ap25%'] = np.average(aps[d_inf, li, + o25]) + avg_dict['classes'][label_name]['prec50%'] = np.average(pr_rc[0, li, + o50]) + avg_dict['classes'][label_name]['rec50%'] = np.average(pr_rc[1, li, + o50]) + return avg_dict + + +def assign_instances_for_scan(pred_info, gt_ids, options, valid_class_ids, + class_labels, id_to_label): + """Assign gt and predicted instances for a single scene. + + Args: + pred_info (dict): Predicted masks, labels and scores. + gt_ids (np.array): Ground truth instance masks. + options (dict): ScanNet evaluator options. See get_options. + valid_class_ids (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Class names. + id_to_label (dict[int, str]): Mapping of valid class id to class label. + + Returns: + dict: Per class assigned gt to predicted instances. + dict: Per class assigned predicted to gt instances. + """ + # get gt instances + gt_instances = util_3d.get_instances(gt_ids, valid_class_ids, class_labels, + id_to_label) + # associate + gt2pred = deepcopy(gt_instances) + for label in gt2pred: + for gt in gt2pred[label]: + gt['matched_pred'] = [] + pred2gt = {} + for label in class_labels: + pred2gt[label] = [] + num_pred_instances = 0 + # mask of void labels in the ground truth + bool_void = np.logical_not(np.in1d(gt_ids // 1000, valid_class_ids)) + # go through all prediction masks + for pred_mask_file in pred_info: + label_id = int(pred_info[pred_mask_file]['label_id']) + conf = pred_info[pred_mask_file]['conf'] + if not label_id in id_to_label: # noqa E713 + continue + label_name = id_to_label[label_id] + # read the mask + pred_mask = pred_info[pred_mask_file]['mask'] + if len(pred_mask) != len(gt_ids): + raise ValueError('len(pred_mask) != len(gt_ids)') + # convert to binary + pred_mask = np.not_equal(pred_mask, 0) + num = np.count_nonzero(pred_mask) + if num < options['min_region_sizes'][0]: + continue # skip if empty + + pred_instance = {} + pred_instance['filename'] = pred_mask_file + pred_instance['pred_id'] = num_pred_instances + pred_instance['label_id'] = label_id + pred_instance['vert_count'] = num + pred_instance['confidence'] = conf + pred_instance['void_intersection'] = np.count_nonzero( + np.logical_and(bool_void, pred_mask)) + + # matched gt instances + matched_gt = [] + # go through all gt instances with matching label + for (gt_num, gt_inst) in enumerate(gt2pred[label_name]): + intersection = np.count_nonzero( + np.logical_and(gt_ids == gt_inst['instance_id'], pred_mask)) + if intersection > 0: + gt_copy = gt_inst.copy() + pred_copy = pred_instance.copy() + gt_copy['intersection'] = intersection + pred_copy['intersection'] = intersection + matched_gt.append(gt_copy) + 
gt2pred[label_name][gt_num]['matched_pred'].append(pred_copy) + pred_instance['matched_gt'] = matched_gt + num_pred_instances += 1 + pred2gt[label_name].append(pred_instance) + + return gt2pred, pred2gt + + +def scannet_eval(preds, gts, options, valid_class_ids, class_labels, + id_to_label): + """Evaluate instance segmentation in ScanNet protocol. + + Args: + preds (list[dict]): Per scene predictions of mask, label and + confidence. + gts (list[np.array]): Per scene ground truth instance masks. + options (dict): ScanNet evaluator options. See get_options. + valid_class_ids (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Class names. + id_to_label (dict[int, str]): Mapping of valid class id to class label. + + Returns: + dict: Overall and per-category AP scores. + """ + options = get_options(options) + matches = {} + for i, (pred, gt) in enumerate(zip(preds, gts)): + matches_key = i + # assign gt to predictions + gt2pred, pred2gt = assign_instances_for_scan(pred, gt, options, + valid_class_ids, + class_labels, id_to_label) + matches[matches_key] = {} + matches[matches_key]['gt'] = gt2pred + matches[matches_key]['pred'] = pred2gt + + ap_scores, pr_rc = evaluate_matches(matches, class_labels, options) + avgs = compute_averages(ap_scores, pr_rc, options, class_labels) + return avgs + + +def get_options(options=None): + """Set ScanNet evaluator options. + + Args: + options (dict, optional): Not default options. Default: None. + + Returns: + dict: Updated options with all 4 keys. + """ + assert options is None or isinstance(options, dict) + _options = dict( + overlaps=np.append(np.arange(0.5, 0.95, 0.05), 0.25), + min_region_sizes=np.array([100]), + distance_threshes=np.array([float('inf')]), + distance_confs=np.array([-float('inf')])) + if options is not None: + _options.update(options) + return _options diff --git a/oneformer3d/formatting.py b/oneformer3d/formatting.py new file mode 100644 index 0000000..cdf00f2 --- /dev/null +++ b/oneformer3d/formatting.py @@ -0,0 +1,142 @@ +# Adapted from mmdet3d/datasets/transforms/formating.py +import numpy as np +from .structures import InstanceData_ +from mmdet3d.datasets.transforms import Pack3DDetInputs +from mmdet3d.datasets.transforms.formating import to_tensor +from mmdet3d.registry import TRANSFORMS +from mmdet3d.structures import BaseInstance3DBoxes, Det3DDataSample, PointData +from mmdet3d.structures.points import BasePoints + + +@TRANSFORMS.register_module() +class Pack3DDetInputs_(Pack3DDetInputs): + """Just add elastic_coords, sp_pts_mask, and gt_sp_masks. + """ + INPUTS_KEYS = ['points', 'img', 'elastic_coords'] + SEG_KEYS = [ + 'gt_seg_map', + 'pts_instance_mask', + 'pts_semantic_mask', + 'gt_semantic_seg', + 'sp_pts_mask', + ] + INSTANCEDATA_3D_KEYS = [ + 'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d', + 'gt_sp_masks' + ] + + def pack_single_results(self, results: dict) -> dict: + """Method to pack the single input data. when the value in this dict is + a list, it usually is in Augmentations Testing. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: A dict contains + + - 'inputs' (dict): The forward data of models. It usually contains + following keys: + + - points + - img + + - 'data_samples' (:obj:`Det3DDataSample`): The annotation info + of the sample. 
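A hypothetical pipeline entry showing where the extended keys plug in; the key list here is illustrative, not taken from any shipped config:

```python
# Hypothetical train-pipeline entry; key names come from the class
# constants above (INPUTS_KEYS, SEG_KEYS, INSTANCEDATA_3D_KEYS).
pack = dict(
    type='Pack3DDetInputs_',
    keys=['points', 'elastic_coords', 'sp_pts_mask', 'gt_sp_masks',
          'pts_semantic_mask', 'pts_instance_mask', 'gt_labels_3d'])
```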
+ """ + # Format 3D data + if 'points' in results: + if isinstance(results['points'], BasePoints): + results['points'] = results['points'].tensor + + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = np.stack(results['img'], axis=0) + if imgs.flags.c_contiguous: + imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous() + else: + imgs = to_tensor( + np.ascontiguousarray(imgs.transpose(0, 3, 1, 2))) + results['img'] = imgs + else: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # `torch.permute()` rather than `np.transpose()`. + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if img.flags.c_contiguous: + img = to_tensor(img).permute(2, 0, 1).contiguous() + else: + img = to_tensor( + np.ascontiguousarray(img.transpose(2, 0, 1))) + results['img'] = img + + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', + 'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask', + 'pts_semantic_mask', 'sp_pts_mask', 'gt_sp_masks', + 'elastic_coords', 'centers_2d', 'depths', 'gt_labels_3d' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = [to_tensor(res) for res in results[key]] + else: + results[key] = to_tensor(results[key]) + if 'gt_bboxes_3d' in results: + if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): + results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d']) + + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = to_tensor( + results['gt_semantic_seg'][None]) + if 'gt_seg_map' in results: + results['gt_seg_map'] = results['gt_seg_map'][None, ...] + + data_sample = Det3DDataSample() + gt_instances_3d = InstanceData_() + gt_instances = InstanceData_() + gt_pts_seg = PointData() + + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + data_sample.set_metainfo(img_metas) + + inputs = {} + for key in self.keys: + if key in results: + if key in self.INPUTS_KEYS: + inputs[key] = results[key] + elif key in self.INSTANCEDATA_3D_KEYS: + gt_instances_3d[self._remove_prefix(key)] = results[key] + elif key in self.INSTANCEDATA_2D_KEYS: + if key == 'gt_bboxes_labels': + gt_instances['labels'] = results[key] + else: + gt_instances[self._remove_prefix(key)] = results[key] + elif key in self.SEG_KEYS: + gt_pts_seg[self._remove_prefix(key)] = results[key] + else: + raise NotImplementedError(f'Please modified ' + f'`Pack3DDetInputs` ' + f'to put {key} to ' + f'corresponding field') + + data_sample.gt_instances_3d = gt_instances_3d + data_sample.gt_instances = gt_instances + data_sample.gt_pts_seg = gt_pts_seg + if 'eval_ann_info' in results: + data_sample.eval_ann_info = results['eval_ann_info'] + else: + data_sample.eval_ann_info = None + + packed_results = dict() + packed_results['data_samples'] = data_sample + packed_results['inputs'] = inputs + + return packed_results diff --git a/oneformer3d/instance_criterion.py b/oneformer3d/instance_criterion.py new file mode 100644 index 0000000..61e1797 --- /dev/null +++ b/oneformer3d/instance_criterion.py @@ -0,0 +1,724 @@ +import torch +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment + +from .structures import InstanceData_ +from mmdet3d.registry import MODELS, TASK_UTILS + + +def batch_sigmoid_bce_loss(inputs, targets): + """Sigmoid BCE loss. + + Args: + inputs: of shape (n_queries, n_points). 
+ targets: of shape (n_gts, n_points). + + Returns: + Tensor: Loss of shape (n_queries, n_gts). + """ + pos = F.binary_cross_entropy_with_logits( + inputs, torch.ones_like(inputs), reduction='none') + neg = F.binary_cross_entropy_with_logits( + inputs, torch.zeros_like(inputs), reduction='none') + + pos_loss = torch.einsum('nc,mc->nm', pos, targets) + neg_loss = torch.einsum('nc,mc->nm', neg, (1 - targets)) + return (pos_loss + neg_loss) / inputs.shape[1] + + +def batch_dice_loss(inputs, targets): + """Dice loss. + + Args: + inputs: of shape (n_queries, n_points). + targets: of shape (n_gts, n_points). + + Returns: + Tensor: Loss of shape (n_queries, n_gts). + """ + inputs = inputs.sigmoid() + numerator = 2 * torch.einsum('nc,mc->nm', inputs, targets) + denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +def get_iou(inputs, targets): + """IoU for to equal shape masks. + + Args: + inputs (Tensor): of shape (n_gts, n_points). + targets (Tensor): of shape (n_gts, n_points). + + Returns: + Tensor: IoU of shape (n_gts,). + """ + inputs = inputs.sigmoid() + binarized_inputs = (inputs >= 0.5).float() + targets = (targets > 0.5).float() + intersection = (binarized_inputs * targets).sum(-1) + union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection + score = intersection / (union + 1e-6) + return score + + +def dice_loss(inputs, targets): + """Compute the DICE loss, similar to generalized IOU for masks. + + Args: + inputs (Tensor): A float tensor of arbitrary shape. + The predictions for each example. + targets (Tensor): A float tensor with the same shape as inputs. + Stores the binary classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + Tensor: loss value. + """ + inputs = inputs.sigmoid() + numerator = 2 * (inputs * targets).sum(-1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.mean() + + +@MODELS.register_module() +class InstanceCriterion: + """Instance criterion. + + Args: + matcher (Callable): Class for matching queries with gt. + loss_weight (List[float]): 4 weights for query classification, + mask bce, mask dice, and score losses. + non_object_weight (float): no_object weight for query classification. + num_classes (int): number of classes. + fix_dice_loss_weight (bool): Whether to fix dice loss for + batch_size != 4. + iter_matcher (bool): Whether to use separate matcher for + each decoder layer. + fix_mean_loss (bool): Whether to use .mean() instead of .sum() + for mask losses. + + """ + + def __init__(self, matcher, loss_weight, non_object_weight, num_classes, + fix_dice_loss_weight, iter_matcher, fix_mean_loss=False): + self.matcher = TASK_UTILS.build(matcher) + class_weight = [1] * num_classes + [non_object_weight] + self.class_weight = class_weight + self.loss_weight = loss_weight + self.num_classes = num_classes + self.fix_dice_loss_weight = fix_dice_loss_weight + self.iter_matcher = iter_matcher + self.fix_mean_loss = fix_mean_loss + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def get_layer_loss(self, aux_outputs, insts, indices=None): + """Per layer auxiliary loss. 
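For intuition on the pairwise costs defined above: `batch_sigmoid_bce_loss` scores every query against every ground truth in one shot via `einsum`. A small self-contained sketch:

```python
import torch
import torch.nn.functional as F

inputs = torch.randn(4, 10)                     # 4 query masks, 10 points (logits)
targets = torch.randint(0, 2, (3, 10)).float()  # 3 gt masks (binary)

pos = F.binary_cross_entropy_with_logits(
    inputs, torch.ones_like(inputs), reduction='none')
neg = F.binary_cross_entropy_with_logits(
    inputs, torch.zeros_like(inputs), reduction='none')
# (4, 3) cost matrix: BCE of every query-gt pair, averaged over points
cost = (torch.einsum('nc,mc->nm', pos, targets) +
        torch.einsum('nc,mc->nm', neg, 1 - targets)) / inputs.shape[1]
print(cost.shape)  # torch.Size([4, 3])
```

`batch_dice_loss` follows the same pattern, with the dice numerator computed by the analogous `einsum`.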
+ + Args: + aux_outputs (Dict): + List `cls_preds` of shape len batch_size, each of shape + (n_queries, n_classes + 1) + List `scores` of len batch_size each of shape (n_queries, 1) + List `masks` of len batch_size each of shape + (n_queries, n_points) + insts (List): + Ground truth of len batch_size, each InstanceData_ with + `sp_masks` of shape (n_gts_i, n_points_i) + `labels_3d` of shape (n_gts_i,) + `query_masks` of shape (n_gts_i, n_queries_i). + + Returns: + Tensor: loss value. + """ + cls_preds = aux_outputs['cls_preds'] + pred_scores = aux_outputs['scores'] + pred_masks = aux_outputs['masks'] + + if indices is None: + indices = [] + for i in range(len(insts)): + pred_instances = InstanceData_( + scores=cls_preds[i], + masks=pred_masks[i]) + gt_instances = InstanceData_( + labels=insts[i].labels_3d, + masks=insts[i].sp_masks) + if insts[i].get('query_masks') is not None: + gt_instances.query_masks = insts[i].query_masks + indices.append(self.matcher(pred_instances, gt_instances)) + + cls_losses = [] + for cls_pred, inst, (idx_q, idx_gt) in zip(cls_preds, insts, indices): + n_classes = cls_pred.shape[1] - 1 + cls_target = cls_pred.new_full( + (len(cls_pred),), n_classes, dtype=torch.long) + cls_target[idx_q] = inst.labels_3d[idx_gt] + cls_losses.append(F.cross_entropy( + cls_pred, cls_target, cls_pred.new_tensor(self.class_weight))) + cls_loss = torch.mean(torch.stack(cls_losses)) + + # 3 other losses + score_losses, mask_bce_losses, mask_dice_losses = [], [], [] + for mask, score, inst, (idx_q, idx_gt) in zip(pred_masks, pred_scores, + insts, indices): + if len(inst) == 0: + continue + + pred_mask = mask[idx_q] + tgt_mask = inst.sp_masks[idx_gt] + mask_bce_losses.append(F.binary_cross_entropy_with_logits( + pred_mask, tgt_mask.float())) + mask_dice_losses.append(dice_loss(pred_mask, tgt_mask.float())) + + # check if skip objectness loss + if score is None: + continue + + pred_score = score[idx_q] + with torch.no_grad(): + tgt_score = get_iou(pred_mask, tgt_mask).unsqueeze(1) + + filter_id, _ = torch.where(tgt_score > 0.5) + if filter_id.numel(): + tgt_score = tgt_score[filter_id] + pred_score = pred_score[filter_id] + score_losses.append(F.mse_loss(pred_score, tgt_score)) + # todo: actually .mean() should be better + if len(score_losses): + score_loss = torch.stack(score_losses).sum() / len(pred_masks) + else: + score_loss = 0 + + if len(mask_bce_losses): + mask_bce_loss = torch.stack(mask_bce_losses).sum() / len(pred_masks) + mask_dice_loss = torch.stack(mask_dice_losses).sum() / len(pred_masks) + + if self.fix_dice_loss_weight: + mask_dice_loss = mask_dice_loss / len(pred_masks) * 4 + + if self.fix_mean_loss: + mask_bce_loss = mask_bce_loss * len(pred_masks) \ + / len(mask_bce_losses) + mask_dice_loss = mask_dice_loss * len(pred_masks) \ + / len(mask_dice_losses) + else: + mask_bce_loss = 0 + mask_dice_loss = 0 + + loss = ( + self.loss_weight[0] * cls_loss + + self.loss_weight[1] * mask_bce_loss + + self.loss_weight[2] * mask_dice_loss + + self.loss_weight[3] * score_loss) + + return loss + + # todo: refactor pred to InstanceData_ + def __call__(self, pred, insts): + """Loss main function. + + Args: + pred (Dict): + List `cls_preds` of shape len batch_size, each of shape + (n_queries, n_classes + 1) + List `scores` of len batch_size each of shape (n_queries, 1) + List `masks` of len batch_size each of shape + (n_queries, n_points) + Dict `aux_preds` with list of cls_preds, scores, and masks. 
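End to end, the matching step used by both `get_layer_loss` and `__call__` looks roughly like this; the cost weights are placeholders rather than values from any shipped config, and the `TASK_UTILS` registry is assumed to be initialized (mmdet3d installed):

```python
import torch

# Placeholder weights; real configs define their own.
matcher = HungarianMatcher(costs=[
    dict(type='QueryClassificationCost', weight=0.5),
    dict(type='MaskBCECost', weight=1.0),
    dict(type='MaskDiceCost', weight=1.0)])

pred = InstanceData_(scores=torch.randn(5, 19),  # 5 queries, 18 classes + 1
                     masks=torch.randn(5, 100))  # mask logits over 100 points
gt = InstanceData_(labels=torch.randint(0, 18, (3,)),
                   masks=torch.randint(0, 2, (3, 100)).bool())
query_ids, gt_ids = matcher(pred, gt)            # both of shape (3,)
```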
+ insts (List): + Ground truth of len batch_size, each InstanceData_ with + `sp_masks` of shape (n_gts_i, n_points_i) + `labels_3d` of shape (n_gts_i,) + `query_masks` of shape (n_gts_i, n_queries_i). + + Returns: + Dict: with instance loss value. + """ + cls_preds = pred['cls_preds'] + pred_scores = pred['scores'] + pred_masks = pred['masks'] + + # match + indices = [] + for i in range(len(insts)): + pred_instances = InstanceData_( + scores=cls_preds[i], + masks=pred_masks[i]) + gt_instances = InstanceData_( + labels=insts[i].labels_3d, + masks=insts[i].sp_masks) + if insts[i].get('query_masks') is not None: + gt_instances.query_masks = insts[i].query_masks + indices.append(self.matcher(pred_instances, gt_instances)) + + # class loss + cls_losses = [] + for cls_pred, inst, (idx_q, idx_gt) in zip(cls_preds, insts, indices): + n_classes = cls_pred.shape[1] - 1 + cls_target = cls_pred.new_full( + (len(cls_pred),), n_classes, dtype=torch.long) + cls_target[idx_q] = inst.labels_3d[idx_gt] + cls_losses.append(F.cross_entropy( + cls_pred, cls_target, cls_pred.new_tensor(self.class_weight))) + cls_loss = torch.mean(torch.stack(cls_losses)) + + # 3 other losses + score_losses, mask_bce_losses, mask_dice_losses = [], [], [] + for mask, score, inst, (idx_q, idx_gt) in zip(pred_masks, pred_scores, + insts, indices): + if len(inst) == 0: + continue + pred_mask = mask[idx_q] + tgt_mask = inst.sp_masks[idx_gt] + mask_bce_losses.append(F.binary_cross_entropy_with_logits( + pred_mask, tgt_mask.float())) + mask_dice_losses.append(dice_loss(pred_mask, tgt_mask.float())) + + # check if skip objectness loss + if score is None: + continue + + pred_score = score[idx_q] + with torch.no_grad(): + tgt_score = get_iou(pred_mask, tgt_mask).unsqueeze(1) + + filter_id, _ = torch.where(tgt_score > 0.5) + if filter_id.numel(): + tgt_score = tgt_score[filter_id] + pred_score = pred_score[filter_id] + score_losses.append(F.mse_loss(pred_score, tgt_score)) + # todo: actually .mean() should be better + if len(score_losses): + score_loss = torch.stack(score_losses).sum() / len(pred_masks) + else: + score_loss = 0 + + if len(mask_bce_losses): + mask_bce_loss = torch.stack(mask_bce_losses).sum() / len(pred_masks) + mask_dice_loss = torch.stack(mask_dice_losses).sum() + + if self.fix_dice_loss_weight: + mask_dice_loss = mask_dice_loss / len(pred_masks) * 4 + + if self.fix_mean_loss: + mask_bce_loss = mask_bce_loss * len(pred_masks) \ + / len(mask_bce_losses) + mask_dice_loss = mask_dice_loss * len(pred_masks) \ + / len(mask_dice_losses) + else: + mask_bce_loss = 0 + mask_dice_loss = 0 + + loss = ( + self.loss_weight[0] * cls_loss + + self.loss_weight[1] * mask_bce_loss + + self.loss_weight[2] * mask_dice_loss + + self.loss_weight[3] * score_loss) + + if 'aux_outputs' in pred: + if self.iter_matcher: + indices = None + for i, aux_outputs in enumerate(pred['aux_outputs']): + loss += self.get_layer_loss(aux_outputs, insts, indices) + + return {'inst_loss': loss} + + +@TASK_UTILS.register_module() +class QueryClassificationCost: + """Classification cost for queries. + + Args: + weigth (float): Weight of the cost. + """ + def __init__(self, weight): + self.weight = weight + + def __call__(self, pred_instances, gt_instances, **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData_`): Predicted instances which + must contain `scores` of shape (n_queries, n_classes + 1), + gt_instances (:obj:`InstanceData_`): Ground truth which must contain + `labels` of shape (n_gts,). 
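Numerically, this cost is just the negative class probability gathered at each gt label (the `weight` factor is applied afterwards):

```python
import torch

scores = torch.randn(4, 6)             # 4 queries, 5 classes + no-object
labels = torch.tensor([0, 2, 2])       # labels of 3 gt instances
cost = -scores.softmax(-1)[:, labels]  # (4, 3); lower = better match
```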
+ + Returns: + Tensor: Cost of shape (n_queries, n_gts). + """ + scores = pred_instances.scores.softmax(-1) + cost = -scores[:, gt_instances.labels] + return cost * self.weight + + +@TASK_UTILS.register_module() +class MaskBCECost: + """Sigmoid BCE cost for masks. + + Args: + weigth (float): Weight of the cost. + """ + def __init__(self, weight): + self.weight = weight + + def __call__(self, pred_instances, gt_instances, **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData_`): Predicted instances which + mast contain `masks` of shape (n_queries, n_points). + gt_instances (:obj:`InstanceData_`): Ground truth which must contain + `labels` of shape (n_gts,), `masks` of shape (n_gts, n_points). + + Returns: + Tensor: Cost of shape (n_queries, n_gts). + """ + cost = batch_sigmoid_bce_loss( + pred_instances.masks, gt_instances.masks.float()) + return cost * self.weight + + +@TASK_UTILS.register_module() +class MaskDiceCost: + """Dice cost for masks. + + Args: + weigth (float): Weight of the cost. + """ + def __init__(self, weight): + self.weight = weight + + def __call__(self, pred_instances, gt_instances, **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData_`): Predicted instances which + mast contain `masks` of shape (n_queries, n_points). + gt_instances (:obj:`InstanceData_`): Ground truth which must contain + `masks` of shape (n_gts, n_points). + + Returns: + Tensor: Cost of shape (n_queries, n_gts). + """ + cost = batch_dice_loss( + pred_instances.masks, gt_instances.masks.float()) + return cost * self.weight + + +@TASK_UTILS.register_module() +class HungarianMatcher: + """Hungarian matcher. + + Args: + costs (List[ConfigDict]): Cost functions. + """ + def __init__(self, costs): + self.costs = [] + for cost in costs: + self.costs.append(TASK_UTILS.build(cost)) + + @torch.no_grad() + def __call__(self, pred_instances, gt_instances, **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData_`): Predicted instances which + can contain `masks` of shape (n_queries, n_points), `scores` + of shape (n_queries, n_classes + 1), + gt_instances (:obj:`InstanceData_`): Ground truth which can contain + `labels` of shape (n_gts,), `masks` of shape (n_gts, n_points). + + Returns: + Tuple: + - Tensor: Query ids of shape (n_matched,), + - Tensor: Object ids of shape (n_matched,). + """ + labels = gt_instances.labels + n_gts = len(labels) + if n_gts == 0: + return labels.new_empty((0,)), labels.new_empty((0,)) + + cost_values = [] + for cost in self.costs: + cost_values.append(cost(pred_instances, gt_instances)) + cost_value = torch.stack(cost_values).sum(dim=0) + query_ids, object_ids = linear_sum_assignment(cost_value.cpu().numpy()) + return labels.new_tensor(query_ids), labels.new_tensor(object_ids) + + +@TASK_UTILS.register_module() +class SparseMatcher: + """Match only queries to their including objects. + + Args: + costs (List[Callable]): Cost functions. + topk (int): Limit topk matches per query. + """ + + def __init__(self, costs, topk): + self.topk = topk + self.costs = [] + self.inf = 1e8 + for cost in costs: + self.costs.append(TASK_UTILS.build(cost)) + + @torch.no_grad() + def __call__(self, pred_instances, gt_instances, **kwargs): + """Compute match cost. 
+ + Args: + pred_instances (:obj:`InstanceData_`): Predicted instances which + can contain `masks` of shape (n_queries, n_points), `scores` + of shape (n_queries, n_classes + 1), + gt_instances (:obj:`InstanceData_`): Ground truth which can contain + `labels` of shape (n_gts,), `masks` of shape (n_gts, n_points), + `query_masks` of shape (n_gts, n_queries). + + Returns: + Tuple: + Tensor: Query ids of shape (n_matched,), + Tensor: Object ids of shape (n_matched,). + """ + labels = gt_instances.labels + n_gts = len(labels) + if n_gts == 0: + return labels.new_empty((0,)), labels.new_empty((0,)) + + cost_values = [] + for cost in self.costs: + cost_values.append(cost(pred_instances, gt_instances)) + # of shape (n_queries, n_gts) + cost_value = torch.stack(cost_values).sum(dim=0) + cost_value = torch.where( + gt_instances.query_masks.T, cost_value, self.inf) + + values = torch.topk( + cost_value, self.topk + 1, dim=0, sorted=True, + largest=False).values[-1:, :] + ids = torch.argwhere(cost_value < values) + return ids[:, 0], ids[:, 1] + + +@MODELS.register_module() +class OneDataCriterion: + """Loss module for SPFormer. + + Args: + matcher (Callable): Class for matching queries with gt. + loss_weight (List[float]): 4 weights for query classification, + mask bce, mask dice, and score losses. + non_object_weight (float): no_object weight for query classification. + num_classes_1dataset (int): Number of classes in the first dataset. + num_classes_2dataset (int): Number of classes in the second dataset. + fix_dice_loss_weight (bool): Whether to fix dice loss for + batch_size != 4. + iter_matcher (bool): Whether to use separate matcher for + each decoder layer. + """ + + def __init__(self, matcher, loss_weight, non_object_weight, + num_classes_1dataset, num_classes_2dataset, + fix_dice_loss_weight, iter_matcher): + self.matcher = TASK_UTILS.build(matcher) + self.num_classes_1dataset = num_classes_1dataset + self.num_classes_2dataset = num_classes_2dataset + self.class_weight_1dataset = [1] * num_classes_1dataset + [non_object_weight] + self.class_weight_2dataset = [1] * num_classes_2dataset + [non_object_weight] + self.loss_weight = loss_weight + self.fix_dice_loss_weight = fix_dice_loss_weight + self.iter_matcher = iter_matcher + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def get_layer_loss(self, aux_outputs, insts, indices=None): + cls_preds = aux_outputs['cls_preds'] + pred_scores = aux_outputs['scores'] + pred_masks = aux_outputs['masks'] + + if indices is None: + indices = [] + for i in range(len(insts)): + pred_instances = InstanceData_( + scores=cls_preds[i], + masks=pred_masks[i]) + gt_instances = InstanceData_( + labels=insts[i].labels_3d, + masks=insts[i].sp_masks) + if insts[i].get('query_masks') is not None: + gt_instances.query_masks = insts[i].query_masks + indices.append(self.matcher(pred_instances, gt_instances)) + + cls_losses = [] + for cls_pred, inst, (idx_q, idx_gt) in zip(cls_preds, insts, indices): + n_classes = cls_pred.shape[1] - 1 + cls_target = cls_pred.new_full( + (len(cls_pred),), n_classes, dtype=torch.long) + cls_target[idx_q] = inst.labels_3d[idx_gt] + if cls_pred.shape[1] == self.num_classes_1dataset + 1: + cls_losses.append(F.cross_entropy( + cls_pred, cls_target, + cls_pred.new_tensor(self.class_weight_1dataset))) + elif cls_pred.shape[1] == 
self.num_classes_2dataset + 1:
+                cls_losses.append(F.cross_entropy(
+                    cls_pred, cls_target,
+                    cls_pred.new_tensor(self.class_weight_2dataset)))
+            else:
+                raise RuntimeError(
+                    f'Invalid number of classes: {cls_pred.shape[1]}.')
+
+        cls_loss = torch.mean(torch.stack(cls_losses))
+
+        # 3 other losses
+        score_losses, mask_bce_losses, mask_dice_losses = [], [], []
+        for mask, score, inst, (idx_q, idx_gt) in zip(
+                pred_masks, pred_scores, insts, indices):
+            if len(inst) == 0:
+                continue
+
+            pred_mask = mask[idx_q]
+            tgt_mask = inst.sp_masks[idx_gt]
+            mask_bce_losses.append(F.binary_cross_entropy_with_logits(
+                pred_mask, tgt_mask.float()))
+            mask_dice_losses.append(dice_loss(pred_mask, tgt_mask.float()))
+
+            # skip objectness loss if scores are absent
+            if score is None:
+                continue
+
+            pred_score = score[idx_q]
+            with torch.no_grad():
+                tgt_score = get_iou(pred_mask, tgt_mask).unsqueeze(1)
+
+            filter_id, _ = torch.where(tgt_score > 0.5)
+            if filter_id.numel():
+                tgt_score = tgt_score[filter_id]
+                pred_score = pred_score[filter_id]
+                score_losses.append(F.mse_loss(pred_score, tgt_score))
+        # todo: actually .mean() should be better
+        if len(score_losses):
+            score_loss = torch.stack(score_losses).sum() / len(pred_masks)
+        else:
+            score_loss = 0
+        mask_bce_loss = torch.stack(mask_bce_losses).sum() / len(pred_masks)
+        mask_dice_loss = torch.stack(mask_dice_losses).sum() / len(pred_masks)
+
+        loss = (
+            self.loss_weight[0] * cls_loss +
+            self.loss_weight[1] * mask_bce_loss +
+            self.loss_weight[2] * mask_dice_loss +
+            self.loss_weight[3] * score_loss)
+
+        return loss
+
+    # todo: refactor pred to InstanceData
+    def __call__(self, pred, insts):
+        """Loss main function.
+
+        Args:
+            pred (Dict):
+                List `cls_preds` of len batch_size, each of shape
+                (n_queries, n_classes + 1);
+                List `scores` of len batch_size, each of shape
+                (n_queries, 1);
+                List `masks` of len batch_size, each of shape
+                (n_queries, n_points).
+                Dict `aux_preds` with list of cls_preds, scores, and masks.
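+
+        Returns:
+            Dict: with `inst_loss` value.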
+ """ + cls_preds = pred['cls_preds'] + pred_scores = pred['scores'] + pred_masks = pred['masks'] + + # match + indices = [] + for i in range(len(insts)): + pred_instances = InstanceData_( + scores=cls_preds[i], + masks=pred_masks[i]) + gt_instances = InstanceData_( + labels=insts[i].labels_3d, + masks=insts[i].sp_masks) + if insts[i].get('query_masks') is not None: + gt_instances.query_masks = insts[i].query_masks + indices.append(self.matcher(pred_instances, gt_instances)) + + # class loss + cls_losses = [] + for cls_pred, inst, (idx_q, idx_gt) in zip(cls_preds, insts, indices): + n_classes = cls_pred.shape[1] - 1 + cls_target = cls_pred.new_full( + (len(cls_pred),), n_classes, dtype=torch.long) + cls_target[idx_q] = inst.labels_3d[idx_gt] + if cls_pred.shape[1] == self.num_classes_1dataset + 1: + cls_losses.append(F.cross_entropy( + cls_pred, cls_target, + cls_pred.new_tensor(self.class_weight_1dataset))) + elif cls_pred.shape[1] == self.num_classes_2dataset + 1: + cls_losses.append(F.cross_entropy( + cls_pred, cls_target, + cls_pred.new_tensor(self.class_weight_2dataset))) + else: + raise RuntimeError( + f'Invalid classes number {cls_pred.shape[1]}.') + + cls_loss = torch.mean(torch.stack(cls_losses)) + + # 3 other losses + score_losses, mask_bce_losses, mask_dice_losses = [], [], [] + for mask, score, inst, (idx_q, idx_gt) in zip(pred_masks, pred_scores, + insts, indices): + if len(inst) == 0: + continue + pred_mask = mask[idx_q] + tgt_mask = inst.sp_masks[idx_gt] + mask_bce_losses.append(F.binary_cross_entropy_with_logits( + pred_mask, tgt_mask.float())) + mask_dice_losses.append(dice_loss(pred_mask, tgt_mask.float())) + + # check if skip objectness loss + if score is None: + continue + + pred_score = score[idx_q] + with torch.no_grad(): + tgt_score = get_iou(pred_mask, tgt_mask).unsqueeze(1) + + filter_id, _ = torch.where(tgt_score > 0.5) + if filter_id.numel(): + tgt_score = tgt_score[filter_id] + pred_score = pred_score[filter_id] + score_losses.append(F.mse_loss(pred_score, tgt_score)) + # todo: actually .mean() should be better + if len(score_losses): + score_loss = torch.stack(score_losses).sum() / len(pred_masks) + else: + score_loss = 0 + mask_bce_loss = torch.stack(mask_bce_losses).sum() / len(pred_masks) + mask_dice_loss = torch.stack(mask_dice_losses).sum() + + if self.fix_dice_loss_weight: + mask_dice_loss = mask_dice_loss / len(pred_masks) * 4 + + loss = ( + self.loss_weight[0] * cls_loss + + self.loss_weight[1] * mask_bce_loss + + self.loss_weight[2] * mask_dice_loss + + self.loss_weight[3] * score_loss) + + if 'aux_outputs' in pred: + if self.iter_matcher: + indices = None + for i, aux_outputs in enumerate(pred['aux_outputs']): + loss += self.get_layer_loss(aux_outputs, insts, indices) + + return {'inst_loss': loss} diff --git a/oneformer3d/instance_seg_eval.py b/oneformer3d/instance_seg_eval.py new file mode 100644 index 0000000..748a4e0 --- /dev/null +++ b/oneformer3d/instance_seg_eval.py @@ -0,0 +1,131 @@ +# Copied from mmdet3d/evaluation/functional/instance_seg_eval.py +# We fix instance seg metric to accept boolean instance seg mask of +# shape (n_points, n_instances) instead of integer mask of shape +# (n_points, ). +import numpy as np +from mmengine.logging import print_log +from terminaltables import AsciiTable + +from .evaluate_semantic_instance import scannet_eval + + +# 1) We fix this line: info[file_name]['mask'] = mask[i]. +# 2) mask.max() + 1 in for is always equal to 2. +# We have changed it to mask.shape[0] for iterating over all masks. 
+def aggregate_predictions(masks, labels, scores, valid_class_ids):
+    """Maps predictions to ScanNet evaluator format.
+
+    Args:
+        masks (list[torch.Tensor]): Per scene predicted instance masks.
+        labels (list[torch.Tensor]): Per scene predicted instance labels.
+        scores (list[torch.Tensor]): Per scene predicted instance scores.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+
+    Returns:
+        list[dict]: Per scene aggregated predictions.
+    """
+    infos = []
+    for id, (mask, label, score) in enumerate(zip(masks, labels, scores)):
+        mask = mask.numpy()
+        label = label.numpy()
+        score = score.numpy()
+        info = dict()
+        for i in range(mask.shape[0]):
+            # match pred_instance['filename'] from assign_instances_for_scan
+            file_name = f'{id}_{i}'
+            info[file_name] = dict()
+            info[file_name]['mask'] = mask[i]
+            info[file_name]['label_id'] = valid_class_ids[label[i]]
+            info[file_name]['conf'] = score[i]
+        infos.append(info)
+    return infos
+
+
+# For some reason the inputs are not torch.Tensor but np.ndarray.
+# We just remove torch -> numpy conversion here.
+def rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids):
+    """Maps gt instance and semantic masks to instance masks for ScanNet
+    evaluator.
+
+    Args:
+        gt_semantic_masks (list[np.ndarray]): Per scene gt semantic masks.
+        gt_instance_masks (list[np.ndarray]): Per scene gt instance masks.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+
+    Returns:
+        list[np.ndarray]: Per scene instance masks.
+    """
+    renamed_instance_masks = []
+    for semantic_mask, instance_mask in zip(gt_semantic_masks,
+                                            gt_instance_masks):
+        unique = np.unique(instance_mask)
+        assert len(unique) < 1000
+        for i in unique:
+            semantic_instance = semantic_mask[instance_mask == i]
+            semantic_unique = np.unique(semantic_instance)
+            assert len(semantic_unique) == 1
+            if semantic_unique[0] in valid_class_ids:
+                instance_mask[instance_mask ==
+                              i] = 1000 * semantic_unique[0] + i
+        renamed_instance_masks.append(instance_mask)
+    return renamed_instance_masks
+
+def instance_seg_eval(gt_semantic_masks,
+                      gt_instance_masks,
+                      pred_instance_masks,
+                      pred_instance_labels,
+                      pred_instance_scores,
+                      valid_class_ids,
+                      class_labels,
+                      options=None,
+                      logger=None):
+    """Instance Segmentation Evaluation.
+
+    Evaluate the result of the instance segmentation.
+
+    Args:
+        gt_semantic_masks (list[torch.Tensor]): Ground truth semantic masks.
+        gt_instance_masks (list[torch.Tensor]): Ground truth instance masks.
+        pred_instance_masks (list[torch.Tensor]): Predicted instance masks.
+        pred_instance_labels (list[torch.Tensor]): Predicted instance labels.
+        pred_instance_scores (list[torch.Tensor]): Predicted instance scores.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+        class_labels (tuple[str]): Names of valid categories.
+        options (dict, optional): Additional options. Keys may contain:
+            `overlaps`, `min_region_sizes`, `distance_threshes`,
+            `distance_confs`. Default: None.
+        logger (logging.Logger | str, optional): The way to print the mAP
+            summary. See `mmdet.utils.print_log()` for details. Default: None.
+
+    Returns:
+        dict[str, float]: Dict of results.
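+
+    Example (toy shapes; values are illustrative only):
+        >>> import numpy as np
+        >>> import torch
+        >>> instance_seg_eval(
+        ...     gt_semantic_masks=[np.array([3, 3, 5, 5])],
+        ...     gt_instance_masks=[np.array([1, 1, 2, 2])],
+        ...     pred_instance_masks=[torch.tensor([[1, 1, 0, 0],
+        ...                                        [0, 0, 1, 1]]).bool()],
+        ...     pred_instance_labels=[torch.tensor([0, 1])],
+        ...     pred_instance_scores=[torch.tensor([0.9, 0.8])],
+        ...     valid_class_ids=(3, 5),
+        ...     class_labels=('chair', 'table'))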
+ """ + assert len(valid_class_ids) == len(class_labels) + id_to_label = { + valid_class_ids[i]: class_labels[i] + for i in range(len(valid_class_ids)) + } + preds = aggregate_predictions( + masks=pred_instance_masks, + labels=pred_instance_labels, + scores=pred_instance_scores, + valid_class_ids=valid_class_ids) + gts = rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids) + metrics = scannet_eval( + preds=preds, + gts=gts, + options=options, + valid_class_ids=valid_class_ids, + class_labels=class_labels, + id_to_label=id_to_label) + header = ['classes', 'AP_0.25', 'AP_0.50', 'AP', 'Prec_0.50', 'Rec_0.50'] + rows = [] + for label, data in metrics['classes'].items(): + aps = [data['ap25%'], data['ap50%'], data['ap'], data['prec50%'], data['rec50%']] + rows.append([label] + [f'{ap:.4f}' for ap in aps]) + aps = metrics['all_ap_25%'], metrics['all_ap_50%'], metrics['all_ap'], metrics['all_prec_50%'], metrics['all_rec_50%'] + footer = ['Overall'] + [f'{ap:.4f}' for ap in aps] + table = AsciiTable([header] + rows + [footer]) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + return metrics diff --git a/oneformer3d/instance_seg_metric.py b/oneformer3d/instance_seg_metric.py new file mode 100644 index 0000000..eab50b5 --- /dev/null +++ b/oneformer3d/instance_seg_metric.py @@ -0,0 +1,106 @@ +# Copied from mmdet3d/evaluation/metrics/instance_seg_metric.py +from mmengine.logging import MMLogger + +from mmdet3d.evaluation import InstanceSegMetric +from mmdet3d.registry import METRICS +from .instance_seg_eval import instance_seg_eval + + +@METRICS.register_module() +class SPInstanceSegMetric(InstanceSegMetric): + """The only difference with InstanceSegMetric is that following ScanNet + evaluator we accept instance prediction as a boolean tensor of shape + (n_points, n_instances) instead of integer tensor of shape (n_points, ). + + For this purpose we only replace instance_seg_eval call. + """ + + def compute_metrics(self, results): + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + self.classes = self.dataset_meta['classes'] + self.valid_class_ids = self.dataset_meta['seg_valid_class_ids'] + + gt_semantic_masks = [] + gt_instance_masks = [] + pred_instance_masks = [] + pred_instance_labels = [] + pred_instance_scores = [] + + for eval_ann, single_pred_results in results: + gt_semantic_masks.append(eval_ann['pts_semantic_mask']) + gt_instance_masks.append(eval_ann['pts_instance_mask']) + pred_instance_masks.append( + single_pred_results['pts_instance_mask']) + pred_instance_labels.append(single_pred_results['instance_labels']) + pred_instance_scores.append(single_pred_results['instance_scores']) + + ret_dict = instance_seg_eval( + gt_semantic_masks, + gt_instance_masks, + pred_instance_masks, + pred_instance_labels, + pred_instance_scores, + valid_class_ids=self.valid_class_ids, + class_labels=self.classes, + logger=logger) + + return ret_dict + + +@METRICS.register_module() +class SPS3DISInstanceSegMetric(InstanceSegMetric): + """The only difference with SPInstanceSegMetric is that we shift + predicted and gt class labels with +1, as ScanNet evaluator ignores + gt label of 0. + """ + + def compute_metrics(self, results): + """Compute the metrics from processed results. 
+ + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + self.classes = self.dataset_meta['classes'] + self.valid_class_ids = self.dataset_meta['seg_valid_class_ids'] + + gt_semantic_masks = [] + gt_instance_masks = [] + pred_instance_masks = [] + pred_instance_labels = [] + pred_instance_scores = [] + + for eval_ann, single_pred_results in results: + gt_semantic_masks.append(eval_ann['pts_semantic_mask'] + 1) + gt_instance_masks.append(eval_ann['pts_instance_mask']) + pred_instance_masks.append( + single_pred_results['pts_instance_mask']) + pred_instance_labels.append(single_pred_results['instance_labels']) + pred_instance_scores.append(single_pred_results['instance_scores']) + + ret_dict = instance_seg_eval( + gt_semantic_masks, + gt_instance_masks, + pred_instance_masks, + pred_instance_labels, + pred_instance_scores, + valid_class_ids=[class_id + 1 for class_id in self.valid_class_ids], + class_labels=self.classes, + logger=logger) + + return ret_dict diff --git a/oneformer3d/loading.py b/oneformer3d/loading.py new file mode 100644 index 0000000..0ea0c12 --- /dev/null +++ b/oneformer3d/loading.py @@ -0,0 +1,106 @@ +# Adapted from mmdet3d/datasets/transforms/loading.py +import mmengine +import numpy as np + +from mmdet3d.datasets.transforms import LoadAnnotations3D +from mmdet3d.datasets.transforms.loading import get +from mmdet3d.datasets.transforms.loading import NormalizePointsColor +from mmdet3d.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class LoadAnnotations3D_(LoadAnnotations3D): + """Just add super point mask loading. + + Args: + with_sp_mask_3d (bool): Whether to load super point maks. + """ + + def __init__(self, with_sp_mask_3d, **kwargs): + self.with_sp_mask_3d = with_sp_mask_3d + super().__init__(**kwargs) + + def _load_sp_pts_3d(self, results): + """Private function to load 3D superpoints mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D mask annotations. + """ + sp_pts_mask_path = results['super_pts_path'] + + try: + mask_bytes = get( + sp_pts_mask_path, backend_args=self.backend_args) + # add .copy() to fix read-only bug + sp_pts_mask = np.frombuffer( + mask_bytes, dtype=np.int64).copy() + except ConnectionError: + mmengine.check_file_exist(sp_pts_mask_path) + sp_pts_mask = np.fromfile( + sp_pts_mask_path, dtype=np.int64) + + results['sp_pts_mask'] = sp_pts_mask + + # 'eval_ann_info' will be passed to evaluator + if 'eval_ann_info' in results: + results['eval_ann_info']['sp_pts_mask'] = sp_pts_mask + results['eval_ann_info']['lidar_idx'] = \ + sp_pts_mask_path.split("/")[-1][:-4] + return results + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D bounding box, label, mask and + semantic segmentation annotations. + """ + results = super().transform(results) + if self.with_sp_mask_3d: + results = self._load_sp_pts_3d(results) + return results + + +@TRANSFORMS.register_module() +class NormalizePointsColor_(NormalizePointsColor): + """Just add color_std parameter. + + Args: + color_mean (list[float]): Mean color of the point cloud. + color_std (list[float]): Std color of the point cloud. 
+ Default value is from SPFormer preprocessing. + """ + + def __init__(self, color_mean, color_std=127.5): + self.color_mean = color_mean + self.color_std = color_std + + def transform(self, input_dict): + """Call function to normalize color of points. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the normalized points. + Updated key and value are described below. + - points (:obj:`BasePoints`): Points after color normalization. + """ + points = input_dict['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims.keys(), \ + 'Expect points have color attribute' + if self.color_mean is not None: + points.color = points.color - \ + points.color.new_tensor(self.color_mean) + if self.color_std is not None: + points.color = points.color / \ + points.color.new_tensor(self.color_std) + input_dict['points'] = points + return input_dict diff --git a/oneformer3d/mask_matrix_nms.py b/oneformer3d/mask_matrix_nms.py new file mode 100644 index 0000000..59f45b8 --- /dev/null +++ b/oneformer3d/mask_matrix_nms.py @@ -0,0 +1,122 @@ +# This is a copy from mmdet/models/layers/matrix_nms.py. +# We just change the input shape of `masks` tensor. +import torch + + +def mask_matrix_nms(masks, + labels, + scores, + filter_thr=-1, + nms_pre=-1, + max_num=-1, + kernel='gaussian', + sigma=2.0, + mask_area=None): + """Matrix NMS for multi-class masks. + + Args: + masks (Tensor): Has shape (num_instances, m) + labels (Tensor): Labels of corresponding masks, + has shape (num_instances,). + scores (Tensor): Mask scores of corresponding masks, + has shape (num_instances). + filter_thr (float): Score threshold to filter the masks + after matrix nms. Default: -1, which means do not + use filter_thr. + nms_pre (int): The max number of instances to do the matrix nms. + Default: -1, which means do not use nms_pre. + max_num (int, optional): If there are more than max_num masks after + matrix, only top max_num will be kept. Default: -1, which means + do not use max_num. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. + mask_area (Tensor): The sum of seg_masks. + + Returns: + tuple(Tensor): Processed mask results. + + - scores (Tensor): Updated scores, has shape (n,). + - labels (Tensor): Remained labels, has shape (n,). + - masks (Tensor): Remained masks, has shape (n, m). + - keep_inds (Tensor): The indices number of + the remaining mask in the input mask, has shape (n,). + """ + assert len(labels) == len(masks) == len(scores) + if len(labels) == 0: + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-1:]), labels.new_zeros(0) + if mask_area is None: + mask_area = masks.sum(1).float() + else: + assert len(masks) == len(mask_area) + + # sort and keep top nms_pre + scores, sort_inds = torch.sort(scores, descending=True) + + keep_inds = sort_inds + if nms_pre > 0 and len(sort_inds) > nms_pre: + sort_inds = sort_inds[:nms_pre] + keep_inds = keep_inds[:nms_pre] + scores = scores[:nms_pre] + masks = masks[sort_inds] + mask_area = mask_area[sort_inds] + labels = labels[sort_inds] + + num_masks = len(labels) + flatten_masks = masks.reshape(num_masks, -1).float() + # inter. + inter_matrix = torch.mm(flatten_masks, flatten_masks.transpose(1, 0)) + expanded_mask_area = mask_area.expand(num_masks, num_masks) + # Upper triangle iou matrix. 
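+    # IoU(i, j) = inter / (area_i + area_j - inter) for every mask pair;
+    # taking the upper triangle keeps each pair once, with the
+    # higher-scoring mask as the row (masks are sorted by score above).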
+ iou_matrix = (inter_matrix / + (expanded_mask_area + expanded_mask_area.transpose(1, 0) - + inter_matrix)).triu(diagonal=1) + # label_specific matrix. + expanded_labels = labels.expand(num_masks, num_masks) + # Upper triangle label matrix. + label_matrix = (expanded_labels == expanded_labels.transpose( + 1, 0)).triu(diagonal=1) + + # IoU compensation + compensate_iou, _ = (iou_matrix * label_matrix).max(0) + compensate_iou = compensate_iou.expand(num_masks, + num_masks).transpose(1, 0) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # Calculate the decay_coefficient + if kernel == 'gaussian': + decay_matrix = torch.exp(-1 * sigma * (decay_iou**2)) + compensate_matrix = torch.exp(-1 * sigma * (compensate_iou**2)) + decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0) + elif kernel == 'linear': + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient, _ = decay_matrix.min(0) + else: + raise NotImplementedError( + f'{kernel} kernel is not supported in matrix nms!') + # update the score. + scores = scores * decay_coefficient + + if filter_thr > 0: + keep = scores >= filter_thr + keep_inds = keep_inds[keep] + if not keep.any(): + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-1:]), labels.new_zeros(0) + masks = masks[keep] + scores = scores[keep] + labels = labels[keep] + + # sort and keep top max_num + scores, sort_inds = torch.sort(scores, descending=True) + keep_inds = keep_inds[sort_inds] + if max_num > 0 and len(sort_inds) > max_num: + sort_inds = sort_inds[:max_num] + keep_inds = keep_inds[:max_num] + scores = scores[:max_num] + masks = masks[sort_inds] + labels = labels[sort_inds] + + return scores, labels, masks, keep_inds diff --git a/oneformer3d/mink_unet.py b/oneformer3d/mink_unet.py new file mode 100644 index 0000000..b1de6bf --- /dev/null +++ b/oneformer3d/mink_unet.py @@ -0,0 +1,597 @@ +# Adapted from JonasSchult/Mask3D. 
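+# Implements a sparse Res16UNet backbone on top of MinkowskiEngine.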
+from enum import Enum +from collections.abc import Sequence +import torch.nn as nn +import MinkowskiEngine as ME +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine import MinkowskiReLU + +from mmengine.model import BaseModule +from mmdet3d.registry import MODELS + + +class NormType(Enum): + BATCH_NORM = 0 + INSTANCE_NORM = 1 + INSTANCE_BATCH_NORM = 2 + + +def get_norm(norm_type, n_channels, D, bn_momentum=0.1): + if norm_type == NormType.BATCH_NORM: + return ME.MinkowskiBatchNorm(n_channels, momentum=bn_momentum) + elif norm_type == NormType.INSTANCE_NORM: + return ME.MinkowskiInstanceNorm(n_channels) + elif norm_type == NormType.INSTANCE_BATCH_NORM: + return nn.Sequential( + ME.MinkowskiInstanceNorm(n_channels), + ME.MinkowskiBatchNorm(n_channels, momentum=bn_momentum)) + else: + raise ValueError(f"Norm type: {norm_type} not supported") + + +class ConvType(Enum): + """ + Define the kernel region type + """ + + HYPERCUBE = 0, "HYPERCUBE" + SPATIAL_HYPERCUBE = 1, "SPATIAL_HYPERCUBE" + SPATIO_TEMPORAL_HYPERCUBE = 2, "SPATIO_TEMPORAL_HYPERCUBE" + HYPERCROSS = 3, "HYPERCROSS" + SPATIAL_HYPERCROSS = 4, "SPATIAL_HYPERCROSS" + SPATIO_TEMPORAL_HYPERCROSS = 5, "SPATIO_TEMPORAL_HYPERCROSS" + SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS = ( + 6, + "SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS") + + def __new__(cls, value, name): + member = object.__new__(cls) + member._value_ = value + member.fullname = name + return member + + def __int__(self): + return self.value + + +# Convert the ConvType var to a RegionType var +conv_to_region_type = { + # kernel_size = [k, k, k, 1] + ConvType.HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.SPATIAL_HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.SPATIO_TEMPORAL_HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIAL_HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIO_TEMPORAL_HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS: ME.RegionType.HYPER_CUBE +} + +# int_to_region_type = {m.value: m for m in ME.RegionType} +int_to_region_type = {m: ME.RegionType(m) for m in range(3)} + + +def convert_region_type(region_type): + """Convert the integer region_type to the corresponding + RegionType enum object. 
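+
+    Only integer values 0-2 are mapped (see `int_to_region_type` above).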
+ """ + return int_to_region_type[region_type] + + +def convert_conv_type(conv_type, kernel_size, D): + assert isinstance(conv_type, ConvType), "conv_type must be of ConvType" + region_type = conv_to_region_type[conv_type] + axis_types = None + if conv_type == ConvType.SPATIAL_HYPERCUBE: + # No temporal convolution + if isinstance(kernel_size, Sequence): + kernel_size = kernel_size[:3] + else: + kernel_size = [ + kernel_size, + ] * 3 + if D == 4: + kernel_size.append(1) + elif conv_type == ConvType.SPATIO_TEMPORAL_HYPERCUBE: + # conv_type conversion already handled + assert D == 4 + elif conv_type == ConvType.HYPERCUBE: + # conv_type conversion already handled + pass + elif conv_type == ConvType.SPATIAL_HYPERCROSS: + if isinstance(kernel_size, Sequence): + kernel_size = kernel_size[:3] + else: + kernel_size = [ + kernel_size, + ] * 3 + if D == 4: + kernel_size.append(1) + elif conv_type == ConvType.HYPERCROSS: + # conv_type conversion already handled + pass + elif conv_type == ConvType.SPATIO_TEMPORAL_HYPERCROSS: + # conv_type conversion already handled + assert D == 4 + elif conv_type == ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS: + # Define the CUBIC conv kernel for spatial dims + # and CROSS conv for temp dim + axis_types = [ + ME.RegionType.HYPER_CUBE, + ] * 3 + if D == 4: + axis_types.append(ME.RegionType.HYPER_CROSS) + return region_type, axis_types, kernel_size + + +def conv(in_planes, + out_planes, + kernel_size, + stride=1, + dilation=1, + bias=False, + conv_type=ConvType.HYPERCUBE, + D=-1): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type( + conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + stride, + dilation, + region_type=region_type, + axis_types=None, # axis_types JONAS + dimension=D) + + return ME.MinkowskiConvolution( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + bias=bias, + kernel_generator=kernel_generator, + dimension=D) + + +def conv_tr(in_planes, + out_planes, + kernel_size, + upsample_stride=1, + dilation=1, + bias=False, + conv_type=ConvType.HYPERCUBE, + D=-1): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type( + conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + upsample_stride, + dilation, + region_type=region_type, + axis_types=axis_types, + dimension=D) + + return ME.MinkowskiConvolutionTranspose( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=upsample_stride, + dilation=dilation, + bias=bias, + kernel_generator=kernel_generator, + dimension=D) + + +class BasicBlockBase(nn.Module): + expansion = 1 + NORM_TYPE = NormType.BATCH_NORM + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + conv_type=ConvType.HYPERCUBE, + bn_momentum=0.1, + D=3): + super().__init__() + + self.conv1 = conv( + inplanes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + conv_type=conv_type, + D=D) + self.norm1 = get_norm( + self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + self.conv2 = conv( + planes, + planes, + kernel_size=3, + stride=1, + dilation=dilation, + bias=False, + conv_type=conv_type, + D=D) + self.norm2 = get_norm( + self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + self.relu = MinkowskiReLU(inplace=True) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + 
out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class BasicBlock(BasicBlockBase): + NORM_TYPE = NormType.BATCH_NORM + + +class Res16UNetBase(BaseModule): + """Base class for Minkowski U-Net. + + Args: + in_channels (int): Number of input channels. + out_channles (int): Number of output channels. + config (dict): Extra parameters including + `dilations`, `conv1_kernel_size`, `bn_momentum`. + D (int): Conv dimension. + """ + BLOCK = None + PLANES = (32, 64, 128, 256, 256, 256, 256, 256) + DILATIONS = (1, 1, 1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + INIT_DIM = 32 + OUT_PIXEL_DIST = 1 + NORM_TYPE = NormType.BATCH_NORM + NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, + in_channels, + out_channels, + config, + D=3, + **kwargs): + self.D = D + super().__init__() + self.network_initialization(in_channels, out_channels, config, D) + self.weight_initialization() + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, ME.MinkowskiBatchNorm): + nn.init.constant_(m.bn.weight, 1) + nn.init.constant_(m.bn.bias, 0) + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + dilation=1, + norm_type=NormType.BATCH_NORM, + bn_momentum=0.1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + D=self.D), + get_norm( + norm_type, + planes * block.expansion, + D=self.D, + bn_momentum=bn_momentum)) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride=stride, + dilation=dilation, + downsample=downsample, + conv_type=self.CONV_TYPE, + D=self.D)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + stride=1, + dilation=dilation, + conv_type=self.CONV_TYPE, + D=self.D)) + + return nn.Sequential(*layers) + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv0p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + + self.bn0 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + + self.conv1p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bn1 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + + self.conv2p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bn2 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + 
self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + + self.conv3p4s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bn3 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + + self.conv4p8s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bn4 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + self.convtr4p16s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + self.convtr5p8s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum) + + self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + self.convtr6p4s2 = conv_tr( + self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum) + + self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion + self.block7 = self._make_layer( + self.BLOCK, + self.PLANES[6], + self.LAYERS[6], + dilation=dilations[6], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + self.convtr7p2s2 = conv_tr( + self.inplanes, + self.PLANES[7], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D) + self.bntr7 = get_norm( + self.NORM_TYPE, self.PLANES[7], D, bn_momentum=bn_momentum) + + self.inplanes = self.PLANES[7] + self.INIT_DIM + self.block8 = self._make_layer( + self.BLOCK, + self.PLANES[7], + self.LAYERS[7], + dilation=dilations[7], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum) + + self.final = conv( + self.PLANES[7], + out_channels, + kernel_size=1, + stride=1, + bias=True, + D=D) + self.relu = MinkowskiReLU(inplace=True) + + def forward(self, x): + feature_maps = [] + + out = self.conv0p1s1(x) + out = self.bn0(out) + out_p1 = self.relu(out) + + out = self.conv1p1s2(out_p1) + out = self.bn1(out) + out = self.relu(out) + out_b1p2 = self.block1(out) + + out 
= self.conv2p2s2(out_b1p2) + out = self.bn2(out) + out = self.relu(out) + out_b2p4 = self.block2(out) + + out = self.conv3p4s2(out_b2p4) + out = self.bn3(out) + out = self.relu(out) + out_b3p8 = self.block3(out) + + # pixel_dist=16 + out = self.conv4p8s2(out_b3p8) + out = self.bn4(out) + out = self.relu(out) + out = self.block4(out) + + feature_maps.append(out) + + # pixel_dist=8 + out = self.convtr4p16s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p8) + out = self.block5(out) + + feature_maps.append(out) + + # pixel_dist=4 + out = self.convtr5p8s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p4) + out = self.block6(out) + + feature_maps.append(out) + + # pixel_dist=2 + out = self.convtr6p4s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p2) + out = self.block7(out) + + feature_maps.append(out) + + # pixel_dist=1 + out = self.convtr7p2s2(out) + out = self.bntr7(out) + out = self.relu(out) + + out = me.cat(out, out_p1) + out = self.block8(out) + + feature_maps.append(out) + + return out + + +class Res16UNet34(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +@MODELS.register_module() +class Res16UNet34C(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 96) diff --git a/oneformer3d/oneformer3d.py b/oneformer3d/oneformer3d.py new file mode 100644 index 0000000..fdfa33e --- /dev/null +++ b/oneformer3d/oneformer3d.py @@ -0,0 +1,1346 @@ +import torch +import torch.nn.functional as F +import spconv.pytorch as spconv +from torch_scatter import scatter_mean +import MinkowskiEngine as ME + +from mmdet3d.registry import MODELS +from mmdet3d.structures import PointData +from mmdet3d.models import Base3DDetector +from .mask_matrix_nms import mask_matrix_nms + + +class ScanNetOneFormer3DMixin: + """Class contains common methods for ScanNet and ScanNet200.""" + + def predict_by_feat(self, out, superpoints): + """Predict instance, semantic, and panoptic masks for a single scene. + + Args: + out (Dict): Decoder output, each value is List of len 1. Keys: + `cls_preds` of shape (n_queries, n_instance_classes + 1), + `sem_preds` of shape (n_queries, n_semantic_classes + 1), + `masks` of shape (n_queries, n_points), + `scores` of shape (n_queris, 1) or None. + superpoints (Tensor): of shape (n_raw_points,). + + Returns: + List[PointData]: of len 1 with `pts_semantic_mask`, + `pts_instance_mask`, `instance_labels`, `instance_scores`. + """ + inst_res = self.predict_by_feat_instance( + out, superpoints, self.test_cfg.inst_score_thr) + sem_res = self.predict_by_feat_semantic(out, superpoints) + pan_res = self.predict_by_feat_panoptic(out, superpoints) + + pts_semantic_mask = [sem_res.cpu().numpy(), pan_res[0].cpu().numpy()] + pts_instance_mask = [inst_res[0].cpu().bool().numpy(), + pan_res[1].cpu().numpy()] + + return [ + PointData( + pts_semantic_mask=pts_semantic_mask, + pts_instance_mask=pts_instance_mask, + instance_labels=inst_res[1].cpu().numpy(), + instance_scores=inst_res[2].cpu().numpy())] + + def predict_by_feat_instance(self, out, superpoints, score_threshold): + """Predict instance masks for a single scene. + + Args: + out (Dict): Decoder output, each value is List of len 1. Keys: + `cls_preds` of shape (n_queries, n_instance_classes + 1), + `masks` of shape (n_queries, n_points), + `scores` of shape (n_queris, 1) or None. + superpoints (Tensor): of shape (n_raw_points,). + score_threshold (float): minimal score for predicted object. 
+
+        Returns:
+            Tuple:
+                Tensor: mask_preds of shape (n_preds, n_raw_points),
+                Tensor: labels of shape (n_preds,),
+                Tensor: scores of shape (n_preds,).
+        """
+        cls_preds = out['cls_preds'][0]
+        pred_masks = out['masks'][0]
+
+        scores = F.softmax(cls_preds, dim=-1)[:, :-1]
+        if out['scores'][0] is not None:
+            scores *= out['scores'][0]
+        labels = torch.arange(
+            self.num_classes,
+            device=scores.device).unsqueeze(0).repeat(
+                len(cls_preds), 1).flatten(0, 1)
+        scores, topk_idx = scores.flatten(0, 1).topk(
+            self.test_cfg.topk_insts, sorted=False)
+        labels = labels[topk_idx]
+
+        topk_idx = torch.div(topk_idx, self.num_classes, rounding_mode='floor')
+        mask_pred = pred_masks
+        mask_pred = mask_pred[topk_idx]
+        mask_pred_sigmoid = mask_pred.sigmoid()
+
+        if self.test_cfg.get('obj_normalization', None):
+            mask_scores = (mask_pred_sigmoid * (mask_pred > 0)).sum(1) / \
+                ((mask_pred > 0).sum(1) + 1e-6)
+            scores = scores * mask_scores
+
+        if self.test_cfg.get('nms', None):
+            kernel = self.test_cfg.matrix_nms_kernel
+            scores, labels, mask_pred_sigmoid, _ = mask_matrix_nms(
+                mask_pred_sigmoid, labels, scores, kernel=kernel)
+
+        mask_pred_sigmoid = mask_pred_sigmoid[:, superpoints]
+        mask_pred = mask_pred_sigmoid > self.test_cfg.sp_score_thr
+
+        # score_thr
+        score_mask = scores > score_threshold
+        scores = scores[score_mask]
+        labels = labels[score_mask]
+        mask_pred = mask_pred[score_mask]
+
+        # npoint_thr
+        mask_pointnum = mask_pred.sum(1)
+        npoint_mask = mask_pointnum > self.test_cfg.npoint_thr
+        scores = scores[npoint_mask]
+        labels = labels[npoint_mask]
+        mask_pred = mask_pred[npoint_mask]
+
+        return mask_pred, labels, scores
+
+    def predict_by_feat_semantic(self, out, superpoints, classes=None):
+        """Predict semantic masks for a single scene.
+
+        Args:
+            out (Dict): Decoder output, each value is List of len 1. Keys:
+                `sem_preds` of shape (n_queries, n_semantic_classes + 1).
+            superpoints (Tensor): of shape (n_raw_points,).
+            classes (List[int] or None): semantic (stuff) class ids.
+
+        Returns:
+            Tensor: semantic preds of shape (n_raw_points,).
+        """
+        if classes is None:
+            classes = list(range(out['sem_preds'][0].shape[1] - 1))
+        return out['sem_preds'][0][:, classes].argmax(dim=1)[superpoints]
+
+    def predict_by_feat_panoptic(self, out, superpoints):
+        """Predict panoptic masks for a single scene.
+
+        Args:
+            out (Dict): Decoder output, each value is List of len 1. Keys:
+                `cls_preds` of shape (n_queries, n_instance_classes + 1),
+                `sem_preds` of shape (n_queries, n_semantic_classes + 1),
+                `masks` of shape (n_queries, n_points),
+                `scores` of shape (n_queries, 1) or None.
+            superpoints (Tensor): of shape (n_raw_points,).
+
+        Returns:
+            Tuple:
+                Tensor: semantic mask of shape (n_raw_points,),
+                Tensor: instance mask of shape (n_raw_points,).
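+
+        Thing instance ids start from `len(self.test_cfg.stuff_classes)`;
+        smaller ids in the instance map correspond to stuff segments.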
+ """ + sem_map = self.predict_by_feat_semantic( + out, superpoints, self.test_cfg.stuff_classes) + mask_pred, labels, scores = self.predict_by_feat_instance( + out, superpoints, self.test_cfg.pan_score_thr) + if mask_pred.shape[0] == 0: + return sem_map, sem_map + + scores, idxs = scores.sort() + labels = labels[idxs] + mask_pred = mask_pred[idxs] + + n_stuff_classes = len(self.test_cfg.stuff_classes) + inst_idxs = torch.arange( + n_stuff_classes, + mask_pred.shape[0] + n_stuff_classes, + device=mask_pred.device).view(-1, 1) + insts = inst_idxs * mask_pred + things_inst_mask, idxs = insts.max(axis=0) + things_sem_mask = labels[idxs] + n_stuff_classes + + inst_idxs, num_pts = things_inst_mask.unique(return_counts=True) + for inst, pts in zip(inst_idxs, num_pts): + if pts <= self.test_cfg.npoint_thr and inst != 0: + things_inst_mask[things_inst_mask == inst] = 0 + + things_sem_mask[things_inst_mask == 0] = 0 + + sem_map[things_inst_mask != 0] = 0 + inst_map = sem_map.clone() + inst_map += things_inst_mask + sem_map += things_sem_mask + return sem_map, inst_map + + def _select_queries(self, x, gt_instances): + """Select queries for train pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, n_channels). + gt_instances (List[InstanceData_]): of len batch_size. + Ground truth which can contain `labels` of shape (n_gts_i,), + `sp_masks` of shape (n_gts_i, n_points_i). + + Returns: + Tuple: + List[Tensor]: Queries of len batch_size, each queries of shape + (n_queries_i, n_channels). + List[InstanceData_]: of len batch_size, each updated + with `query_masks` of shape (n_gts_i, n_queries_i). + """ + queries = [] + for i in range(len(x)): + if self.query_thr < 1: + n = (1 - self.query_thr) * torch.rand(1) + self.query_thr + n = (n * len(x[i])).int() + ids = torch.randperm(len(x[i]))[:n].to(x[i].device) + queries.append(x[i][ids]) + gt_instances[i].query_masks = gt_instances[i].sp_masks[:, ids] + else: + queries.append(x[i]) + gt_instances[i].query_masks = gt_instances[i].sp_masks + return queries, gt_instances + + +@MODELS.register_module() +class ScanNetOneFormer3D(ScanNetOneFormer3DMixin, Base3DDetector): + r"""OneFormer3D for ScanNet dataset. + + Args: + in_channels (int): Number of input channels. + num_channels (int): NUmber of output channels. + voxel_size (float): Voxel size. + num_classes (int): Number of classes. + min_spatial_shape (int): Minimal shape for spconv tensor. + query_thr (float): We select >= query_thr * n_queries queries + for training and all n_queries for testing. + backbone (ConfigDict): Config dict of the backbone. + decoder (ConfigDict): Config dict of the decoder. + criterion (ConfigDict): Config dict of the criterion. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. 
+ """ + + def __init__(self, + in_channels, + num_channels, + voxel_size, + num_classes, + min_spatial_shape, + query_thr, + backbone=None, + decoder=None, + criterion=None, + train_cfg=None, + test_cfg=None, + data_preprocessor=None, + init_cfg=None): + super(Base3DDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.unet = MODELS.build(backbone) + self.decoder = MODELS.build(decoder) + self.criterion = MODELS.build(criterion) + self.voxel_size = voxel_size + self.num_classes = num_classes + self.min_spatial_shape = min_spatial_shape + self.query_thr = query_thr + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers(in_channels, num_channels) + + def _init_layers(self, in_channels, num_channels): + self.input_conv = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + num_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key='subm1')) + self.output_layer = spconv.SparseSequential( + torch.nn.BatchNorm1d(num_channels, eps=1e-4, momentum=0.1), + torch.nn.ReLU(inplace=True)) + + def extract_feat(self, x, superpoints, inverse_mapping, batch_offsets): + """Extract features from sparse tensor. + + Args: + x (SparseTensor): Input sparse tensor of shape + (n_points, in_channels). + superpoints (Tensor): of shape (n_points,). + inverse_mapping (Tesnor): of shape (n_points,). + batch_offsets (List[int]): of len batch_size + 1. + + Returns: + List[Tensor]: of len batch_size, + each of shape (n_points_i, n_channels). + """ + x = self.input_conv(x) + x, _ = self.unet(x) + x = self.output_layer(x) + x = scatter_mean(x.features[inverse_mapping], superpoints, dim=0) + out = [] + for i in range(len(batch_offsets) - 1): + out.append(x[batch_offsets[i]: batch_offsets[i + 1]]) + return out + + def collate(self, points, elastic_points=None): + """Collate batch of points to sparse tensor. + + Args: + points (List[Tensor]): Batch of points. + quantization_mode (SparseTensorQuantizationMode): Minkowski + quantization mode. We use random sample for training + and unweighted average for inference. + + Returns: + TensorField: Containing features and coordinates of a + sparse tensor. + """ + if elastic_points is None: + coordinates, features = ME.utils.batch_sparse_collate( + [((p[:, :3] - p[:, :3].min(0)[0]) / self.voxel_size, + torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0)))) + for p in points]) + else: + coordinates, features = ME.utils.batch_sparse_collate( + [((el_p - el_p.min(0)[0]), + torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0)))) + for el_p, p in zip(elastic_points, points)]) + + spatial_shape = torch.clip( + coordinates.max(0)[0][1:] + 1, self.min_spatial_shape) + field = ME.TensorField(features=features, coordinates=coordinates) + tensor = field.sparse() + coordinates = tensor.coordinates + features = tensor.features + inverse_mapping = field.inverse_mapping(tensor.coordinate_map_key) + + return coordinates, features, inverse_mapping, spatial_shape + + def _forward(*args, **kwargs): + """Implement abstract method of Base3DDetector.""" + pass + + def loss(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs dict and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instances_3d` and `gt_sem_seg_3d`. + Returns: + dict: A dictionary of loss components. 
+ """ + batch_offsets = [0] + superpoint_bias = 0 + sp_gt_instances = [] + sp_pts_masks = [] + for i in range(len(batch_data_samples)): + gt_pts_seg = batch_data_samples[i].gt_pts_seg + + gt_pts_seg.sp_pts_mask += superpoint_bias + superpoint_bias = gt_pts_seg.sp_pts_mask.max().item() + 1 + batch_offsets.append(superpoint_bias) + + sp_gt_instances.append(batch_data_samples[i].gt_instances_3d) + sp_pts_masks.append(gt_pts_seg.sp_pts_mask) + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points'], + batch_inputs_dict.get('elastic_coords', None)) + + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + sp_pts_masks = torch.hstack(sp_pts_masks) + x = self.extract_feat( + x, sp_pts_masks, inverse_mapping, batch_offsets) + queries, sp_gt_instances = self._select_queries(x, sp_gt_instances) + x = self.decoder(x, queries) + loss = self.criterion(x, sp_gt_instances) + return loss + + def predict(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instance_3d` and `gt_sem_seg_3d`. + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input samples. Each Det3DDataSample contains 'pred_pts_seg'. + And the `pred_pts_seg` contains following keys. + - instance_scores (Tensor): Classification scores, has a shape + (num_instance, ) + - instance_labels (Tensor): Labels of instances, has a shape + (num_instances, ) + - pts_instance_mask (Tensor): Instance mask, has a shape + (num_points, num_instances) of type bool. + """ + batch_offsets = [0] + superpoint_bias = 0 + sp_pts_masks = [] + for i in range(len(batch_data_samples)): + gt_pts_seg = batch_data_samples[i].gt_pts_seg + gt_pts_seg.sp_pts_mask += superpoint_bias + superpoint_bias = gt_pts_seg.sp_pts_mask.max().item() + 1 + batch_offsets.append(superpoint_bias) + sp_pts_masks.append(gt_pts_seg.sp_pts_mask) + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points']) + + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + sp_pts_masks = torch.hstack(sp_pts_masks) + x = self.extract_feat( + x, sp_pts_masks, inverse_mapping, batch_offsets) + x = self.decoder(x, x) + + results_list = self.predict_by_feat(x, sp_pts_masks) + for i, data_sample in enumerate(batch_data_samples): + data_sample.pred_pts_seg = results_list[i] + return batch_data_samples + + +@MODELS.register_module() +class ScanNet200OneFormer3D(ScanNetOneFormer3DMixin, Base3DDetector): + """OneFormer3D for ScanNet200 dataset. + + Args: + voxel_size (float): Voxel size. + num_classes (int): Number of classes. + query_thr (float): Min percent of queries. + backbone (ConfigDict): Config dict of the backbone. + neck (ConfigDict, optional): Config dict of the neck. + decoder (ConfigDict): Config dict of the decoder. + criterion (ConfigDict): Config dict of the criterion. + matcher (ConfigDict): To match superpoints to objects. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. 
it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. + """ + + def __init__(self, + voxel_size, + num_classes, + query_thr, + backbone=None, + neck=None, + decoder=None, + criterion=None, + train_cfg=None, + test_cfg=None, + data_preprocessor=None, + init_cfg=None): + super(Base3DDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + self.decoder = MODELS.build(decoder) + self.criterion = MODELS.build(criterion) + self.voxel_size = voxel_size + self.num_classes = num_classes + self.query_thr = query_thr + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feat(self, batch_inputs_dict, batch_data_samples): + """Extract features from sparse tensor. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_pts_seg.sp_pts_mask`. + + Returns: + Tuple: + List[Tensor]: of len batch_size, + each of shape (n_points_i, n_channels). + List[Tensor]: of len batch_size, + each of shape (n_points_i, n_classes + 1). + """ + # construct tensor field + coordinates, features = [], [] + for i in range(len(batch_inputs_dict['points'])): + if 'elastic_coords' in batch_inputs_dict: + coordinates.append( + batch_inputs_dict['elastic_coords'][i] * self.voxel_size) + else: + coordinates.append(batch_inputs_dict['points'][i][:, :3]) + features.append(batch_inputs_dict['points'][i][:, 3:]) + + coordinates, features = ME.utils.batch_sparse_collate( + [(c / self.voxel_size, f) for c, f in zip(coordinates, features)], + device=coordinates[0].device) + field = ME.TensorField(coordinates=coordinates, features=features) + + # forward of backbone and neck + x = self.backbone(field.sparse()) + if self.with_neck: + x = self.neck(x) + x = x.slice(field).features + + # apply scatter_mean + sp_pts_masks, n_super_points = [], [] + for data_sample in batch_data_samples: + sp_pts_mask = data_sample.gt_pts_seg.sp_pts_mask + sp_pts_masks.append(sp_pts_mask + sum(n_super_points)) + n_super_points.append(sp_pts_mask.max() + 1) + x = scatter_mean(x, torch.cat(sp_pts_masks), dim=0) # todo: do we need dim? + + # apply cls_layer + features = [] + for i in range(len(n_super_points)): + begin = sum(n_super_points[:i]) + end = sum(n_super_points[:i + 1]) + features.append(x[begin: end]) + return features + + def _forward(*args, **kwargs): + """Implement abstract method of Base3DDetector.""" + pass + + def loss(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs dict and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instances_3d` and `gt_sem_seg_3d`. + Returns: + dict: A dictionary of loss components. 
+ """ + x = self.extract_feat(batch_inputs_dict, batch_data_samples) + gt_instances = [s.gt_instances_3d for s in batch_data_samples] + queries, gt_instances = self._select_queries(x, gt_instances) + x = self.decoder(x, queries) + return self.criterion(x, gt_instances) + + def predict(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_pts_seg.sp_pts_mask`. + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input samples. Each Det3DDataSample contains 'pred_pts_seg'. + And the `pred_pts_seg` contains following keys. + - instance_scores (Tensor): Classification scores, has a shape + (num_instance, ) + - instance_labels (Tensor): Labels of instances, has a shape + (num_instances, ) + - pts_instance_mask (Tensor): Instance mask, has a shape + (num_points, num_instances) of type bool. + """ + assert len(batch_data_samples) == 1 + x = self.extract_feat(batch_inputs_dict, batch_data_samples) + x = self.decoder(x, x) + pred_pts_seg = self.predict_by_feat( + x, batch_data_samples[0].gt_pts_seg.sp_pts_mask) + batch_data_samples[0].pred_pts_seg = pred_pts_seg[0] + return batch_data_samples + + +@MODELS.register_module() +class S3DISOneFormer3D(Base3DDetector): + r"""OneFormer3D for S3DIS dataset. + + Args: + in_channels (int): Number of input channels. + num_channels (int): NUmber of output channels. + voxel_size (float): Voxel size. + num_classes (int): Number of classes. + min_spatial_shape (int): Minimal shape for spconv tensor. + backbone (ConfigDict): Config dict of the backbone. + decoder (ConfigDict): Config dict of the decoder. + criterion (ConfigDict): Config dict of the criterion. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. + """ + + def __init__(self, + in_channels, + num_channels, + voxel_size, + num_classes, + min_spatial_shape, + backbone=None, + decoder=None, + criterion=None, + train_cfg=None, + test_cfg=None, + data_preprocessor=None, + init_cfg=None): + super(Base3DDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.unet = MODELS.build(backbone) + self.decoder = MODELS.build(decoder) + self.criterion = MODELS.build(criterion) + self.voxel_size = voxel_size + self.num_classes = num_classes + self.min_spatial_shape = min_spatial_shape + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers(in_channels, num_channels) + + def _init_layers(self, in_channels, num_channels): + self.input_conv = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + num_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key='subm1')) + self.output_layer = spconv.SparseSequential( + torch.nn.BatchNorm1d(num_channels, eps=1e-4, momentum=0.1), + torch.nn.ReLU(inplace=True)) + + def extract_feat(self, x): + """Extract features from sparse tensor. 
+
+        Args:
+            x (SparseTensor): Input sparse tensor of shape
+                (n_points, in_channels).
+
+        Returns:
+            List[Tensor]: of len batch_size,
+                each of shape (n_points_i, n_channels).
+        """
+        x = self.input_conv(x)
+        x, _ = self.unet(x)
+        x = self.output_layer(x)
+        out = []
+        for i in x.indices[:, 0].unique():
+            out.append(x.features[x.indices[:, 0] == i])
+        return out
+
+    def collate(self, points, elastic_points=None):
+        """Collate batch of points to sparse tensor.
+
+        Args:
+            points (List[Tensor]): Batch of points.
+            elastic_points (List[Tensor], optional): Batch of elastically
+                distorted point coordinates. Defaults to None.
+
+        Returns:
+            Tuple: Voxel coordinates and features of the sparse tensor,
+                the inverse mapping from voxels to points, and the
+                spatial shape of the tensor.
+        """
+        if elastic_points is None:
+            coordinates, features = ME.utils.batch_sparse_collate(
+                [((p[:, :3] - p[:, :3].min(0)[0]) / self.voxel_size,
+                  torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0))))
+                 for p in points])
+        else:
+            coordinates, features = ME.utils.batch_sparse_collate(
+                [((el_p - el_p.min(0)[0]),
+                  torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0))))
+                 for el_p, p in zip(elastic_points, points)])
+
+        spatial_shape = torch.clip(
+            coordinates.max(0)[0][1:] + 1, self.min_spatial_shape)
+        field = ME.TensorField(features=features, coordinates=coordinates)
+        tensor = field.sparse()
+        coordinates = tensor.coordinates
+        features = tensor.features
+        inverse_mapping = field.inverse_mapping(tensor.coordinate_map_key)
+
+        return coordinates, features, inverse_mapping, spatial_shape
+
+    def _forward(self, *args, **kwargs):
+        """Implement abstract method of Base3DDetector."""
+        pass
+
+    def loss(self, batch_inputs_dict, batch_data_samples, **kwargs):
+        """Calculate losses from a batch of inputs dict and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which includes
+                the `points` key.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It includes information such as
+                `gt_instances_3d` and `gt_sem_seg_3d`.
+        Returns:
+            dict: A dictionary of loss components.
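+
+        Example:
+            An illustrative sketch of the data flow; the names below
+            mirror the method body, while batch contents are assumed:
+
+                coords, feats, inv, shape = self.collate(
+                    batch_inputs_dict['points'])
+                x = spconv.SparseConvTensor(
+                    feats, coords, shape, len(batch_data_samples))
+                out = self.decoder(self.extract_feat(x))
+                # per-sample ground truth is built with
+                # get_gt_semantic_masks / get_gt_inst_masks before the
+                # criterion is applied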
+ """ + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points'], + batch_inputs_dict.get('elastic_coords', None)) + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + + x = self.extract_feat(x) + + x = self.decoder(x) + + sp_gt_instances = [] + for i in range(len(batch_data_samples)): + voxel_superpoints = inverse_mapping[coordinates[:, 0][ \ + inverse_mapping] == i] + voxel_superpoints = torch.unique(voxel_superpoints, + return_inverse=True)[1] + inst_mask = batch_data_samples[i].gt_pts_seg.pts_instance_mask + sem_mask = batch_data_samples[i].gt_pts_seg.pts_semantic_mask + assert voxel_superpoints.shape == inst_mask.shape + + batch_data_samples[i].gt_instances_3d.sp_sem_masks = \ + self.get_gt_semantic_masks(sem_mask, + voxel_superpoints, + self.num_classes) + batch_data_samples[i].gt_instances_3d.sp_inst_masks = \ + self.get_gt_inst_masks(inst_mask, + voxel_superpoints) + sp_gt_instances.append(batch_data_samples[i].gt_instances_3d) + + loss = self.criterion(x, sp_gt_instances) + return loss + + def predict(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Predict results from a batch of inputs and data samples with post- + processing. + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instance_3d` and `gt_sem_seg_3d`. + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input samples. Each Det3DDataSample contains 'pred_pts_seg'. + And the `pred_pts_seg` contains following keys. + - instance_scores (Tensor): Classification scores, has a shape + (num_instance, ) + - instance_labels (Tensor): Labels of instances, has a shape + (num_instances, ) + - pts_instance_mask (Tensor): Instance mask, has a shape + (num_points, num_instances) of type bool. + """ + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points']) + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + + x = self.extract_feat(x) + + x = self.decoder(x) + + results_list = self.predict_by_feat(x, inverse_mapping) + + for i, data_sample in enumerate(batch_data_samples): + data_sample.pred_pts_seg = results_list[i] + return batch_data_samples + + def predict_by_feat(self, out, superpoints): + """Predict instance, semantic, and panoptic masks for a single scene. + + Args: + out (Dict): Decoder output, each value is List of len 1. Keys: + `cls_preds` of shape (n_queries, n_instance_classes + 1), + `masks` of shape (n_queries, n_points), + `scores` of shape (n_queris, 1) or None. + superpoints (Tensor): of shape (n_raw_points,). + + Returns: + List[PointData]: of len 1 with `pts_semantic_mask`, + `pts_instance_mask`, `instance_labels`, `instance_scores`. 
+ """ + pred_labels = out['cls_preds'][0] + pred_masks = out['masks'][0] + pred_scores = out['scores'][0] + + inst_res = self.pred_inst(pred_masks[:-self.test_cfg.num_sem_cls, :], + pred_scores[:-self.test_cfg.num_sem_cls, :], + pred_labels[:-self.test_cfg.num_sem_cls, :], + superpoints, self.test_cfg.inst_score_thr) + sem_res = self.pred_sem(pred_masks[-self.test_cfg.num_sem_cls:, :], + superpoints) + pan_res = self.pred_pan(pred_masks, pred_scores, pred_labels, + superpoints) + + pts_semantic_mask = [sem_res.cpu().numpy(), pan_res[0].cpu().numpy()] + pts_instance_mask = [inst_res[0].cpu().bool().numpy(), + pan_res[1].cpu().numpy()] + + return [ + PointData( + pts_semantic_mask=pts_semantic_mask, + pts_instance_mask=pts_instance_mask, + instance_labels=inst_res[1].cpu().numpy(), + instance_scores=inst_res[2].cpu().numpy())] + + def pred_inst(self, pred_masks, pred_scores, pred_labels, + superpoints, score_threshold): + """Predict instance masks for a single scene. + + Args: + pred_masks (Tensor): of shape (n_queries, n_points). + pred_scores (Tensor): of shape (n_queris, 1). + pred_labels (Tensor): of shape (n_queries, n_instance_classes + 1). + superpoints (Tensor): of shape (n_raw_points,). + score_threshold (float): minimal score for predicted object. + + Returns: + Tuple: + Tensor: mask_preds of shape (n_preds, n_raw_points), + Tensor: labels of shape (n_preds,), + Tensor: scors of shape (n_preds,). + """ + scores = F.softmax(pred_labels, dim=-1)[:, :-1] + scores *= pred_scores + + labels = torch.arange( + self.num_classes, + device=scores.device).unsqueeze(0).repeat( + self.decoder.num_queries - self.test_cfg.num_sem_cls, + 1).flatten(0, 1) + + scores, topk_idx = scores.flatten(0, 1).topk( + self.test_cfg.topk_insts, sorted=False) + labels = labels[topk_idx] + + topk_idx = torch.div(topk_idx, self.num_classes, rounding_mode='floor') + mask_pred = pred_masks + mask_pred = mask_pred[topk_idx] + mask_pred_sigmoid = mask_pred.sigmoid() + if self.test_cfg.get('obj_normalization', None): + mask_pred_thr = mask_pred_sigmoid > \ + self.test_cfg.obj_normalization_thr + mask_scores = (mask_pred_sigmoid * mask_pred_thr).sum(1) / \ + (mask_pred_thr.sum(1) + 1e-6) + scores = scores * mask_scores + + if self.test_cfg.get('nms', None): + kernel = self.test_cfg.matrix_nms_kernel + scores, labels, mask_pred_sigmoid, _ = mask_matrix_nms( + mask_pred_sigmoid, labels, scores, kernel=kernel) + + mask_pred = mask_pred_sigmoid > self.test_cfg.sp_score_thr + mask_pred = mask_pred[:, superpoints] + # score_thr + score_mask = scores > score_threshold + scores = scores[score_mask] + labels = labels[score_mask] + mask_pred = mask_pred[score_mask] + + # npoint_thr + mask_pointnum = mask_pred.sum(1) + npoint_mask = mask_pointnum > self.test_cfg.npoint_thr + scores = scores[npoint_mask] + labels = labels[npoint_mask] + mask_pred = mask_pred[npoint_mask] + + return mask_pred, labels, scores + + def pred_sem(self, pred_masks, superpoints): + """Predict semantic masks for a single scene. + + Args: + pred_masks (Tensor): of shape (n_points, n_semantic_classes). + superpoints (Tensor): of shape (n_raw_points,). + + Returns: + Tensor: semantic preds of shape + (n_raw_points, 1). + """ + mask_pred = pred_masks.sigmoid() + mask_pred = mask_pred[:, superpoints] + seg_map = mask_pred.argmax(0) + return seg_map + + def pred_pan(self, pred_masks, pred_scores, pred_labels, + superpoints): + """Predict panoptic masks for a single scene. + + Args: + pred_masks (Tensor): of shape (n_queries, n_points). 
+ pred_scores (Tensor): of shape (n_queris, 1). + pred_labels (Tensor): of shape (n_queries, n_instance_classes + 1). + superpoints (Tensor): of shape (n_raw_points,). + + Returns: + Tuple: + Tensor: semantic mask of shape (n_raw_points,), + Tensor: instance mask of shape (n_raw_points,). + """ + stuff_cls = pred_masks.new_tensor(self.test_cfg.stuff_cls).long() + sem_map = self.pred_sem( + pred_masks[-self.test_cfg.num_sem_cls + stuff_cls, :], superpoints) + sem_map_src_mapping = stuff_cls[sem_map] + + n_cls = self.test_cfg.num_sem_cls + thr = self.test_cfg.pan_score_thr + mask_pred, labels, scores = self.pred_inst( + pred_masks[:-n_cls, :], pred_scores[:-n_cls, :], + pred_labels[:-n_cls, :], superpoints, thr) + + thing_idxs = torch.zeros_like(labels) + for thing_cls in self.test_cfg.thing_cls: + thing_idxs = thing_idxs.logical_or(labels == thing_cls) + + mask_pred = mask_pred[thing_idxs] + scores = scores[thing_idxs] + labels = labels[thing_idxs] + + if mask_pred.shape[0] == 0: + return sem_map_src_mapping, sem_map + + scores, idxs = scores.sort() + labels = labels[idxs] + mask_pred = mask_pred[idxs] + + inst_idxs = torch.arange( + 0, mask_pred.shape[0], device=mask_pred.device).view(-1, 1) + insts = inst_idxs * mask_pred + things_inst_mask, idxs = insts.max(axis=0) + things_sem_mask = labels[idxs] + + inst_idxs, num_pts = things_inst_mask.unique(return_counts=True) + for inst, pts in zip(inst_idxs, num_pts): + if pts <= self.test_cfg.npoint_thr and inst != 0: + things_inst_mask[things_inst_mask == inst] = 0 + + things_inst_mask = torch.unique( + things_inst_mask, return_inverse=True)[1] + things_inst_mask[things_inst_mask != 0] += len(stuff_cls) - 1 + things_sem_mask[things_inst_mask == 0] = 0 + + sem_map_src_mapping[things_inst_mask != 0] = 0 + sem_map[things_inst_mask != 0] = 0 + sem_map += things_inst_mask + sem_map_src_mapping += things_sem_mask + return sem_map_src_mapping, sem_map + + @staticmethod + def get_gt_semantic_masks(mask_src, sp_pts_mask, num_classes): + """Create ground truth semantic masks. + + Args: + mask_src (Tensor): of shape (n_raw_points, 1). + sp_pts_mask (Tensor): of shape (n_raw_points, 1). + num_classes (Int): number of classes. + + Returns: + sp_masks (Tensor): semantic mask of shape (n_points, num_classes). + """ + + mask = torch.nn.functional.one_hot( + mask_src, num_classes=num_classes + 1) + + mask = mask.T + sp_masks = scatter_mean(mask.float(), sp_pts_mask, dim=-1) + sp_masks = sp_masks > 0.5 + sp_masks[-1, sp_masks.sum(axis=0) == 0] = True + assert sp_masks.sum(axis=0).max().item() == 1 + + return sp_masks + + @staticmethod + def get_gt_inst_masks(mask_src, sp_pts_mask): + """Create ground truth instance masks. + + Args: + mask_src (Tensor): of shape (n_raw_points, 1). + sp_pts_mask (Tensor): of shape (n_raw_points, 1). + + Returns: + sp_masks (Tensor): semantic mask of shape (n_points, num_inst_obj). + """ + mask = mask_src.clone() + if torch.sum(mask == -1) != 0: + mask[mask == -1] = torch.max(mask) + 1 + mask = torch.nn.functional.one_hot(mask)[:, :-1] + else: + mask = torch.nn.functional.one_hot(mask) + + mask = mask.T + sp_masks = scatter_mean(mask, sp_pts_mask, dim=-1) + sp_masks = sp_masks > 0.5 + + return sp_masks + + +@MODELS.register_module() +class InstanceOnlyOneFormer3D(Base3DDetector): + r"""InstanceOnlyOneFormer3D for training on different datasets jointly. + + Args: + in_channels (int): Number of input channels. + num_channels (int): Number of output channels. + voxel_size (float): Voxel size. 
+ num_classes_1dataset (int): Number of classes in the first dataset. + num_classes_2dataset (int): Number of classes in the second dataset. + prefix_1dataset (string): Prefix for the first dataset. + prefix_2dataset (string): Prefix for the second dataset. + min_spatial_shape (int): Minimal shape for spconv tensor. + backbone (ConfigDict): Config dict of the backbone. + decoder (ConfigDict): Config dict of the decoder. + criterion (ConfigDict): Config dict of the criterion. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. + """ + + def __init__(self, + in_channels, + num_channels, + voxel_size, + num_classes_1dataset, + num_classes_2dataset, + prefix_1dataset, + prefix_2dataset, + min_spatial_shape, + backbone=None, + decoder=None, + criterion=None, + train_cfg=None, + test_cfg=None, + data_preprocessor=None, + init_cfg=None): + super(InstanceOnlyOneFormer3D, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.num_classes_1dataset = num_classes_1dataset + self.num_classes_2dataset = num_classes_2dataset + + self.prefix_1dataset = prefix_1dataset + self.prefix_2dataset = prefix_2dataset + + self.unet = MODELS.build(backbone) + self.decoder = MODELS.build(decoder) + self.criterion = MODELS.build(criterion) + self.voxel_size = voxel_size + self.min_spatial_shape = min_spatial_shape + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers(in_channels, num_channels) + + def _init_layers(self, in_channels, num_channels): + self.input_conv = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + num_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key='subm1')) + self.output_layer = spconv.SparseSequential( + torch.nn.BatchNorm1d(num_channels, eps=1e-4, momentum=0.1), + torch.nn.ReLU(inplace=True)) + + def extract_feat(self, x): + """Extract features from sparse tensor. + + Args: + x (SparseTensor): Input sparse tensor of shape + (n_points, in_channels). + + Returns: + List[Tensor]: of len batch_size, + each of shape (n_points_i, n_channels). + """ + x = self.input_conv(x) + x, _ = self.unet(x) + x = self.output_layer(x) + out = [] + for i in x.indices[:, 0].unique(): + out.append(x.features[x.indices[:, 0] == i]) + return out + + def collate(self, points, elastic_points=None): + """Collate batch of points to sparse tensor. + + Args: + points (List[Tensor]): Batch of points. + quantization_mode (SparseTensorQuantizationMode): Minkowski + quantization mode. We use random sample for training + and unweighted average for inference. + + Returns: + TensorField: Containing features and coordinates of a + sparse tensor. 
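+
+        Example:
+            A minimal sketch; two scenes with xyz + rgb points are
+            assumed:
+
+                points = [torch.rand(1000, 6), torch.rand(1500, 6)]
+                coords, feats, inv_map, shape = self.collate(points)
+                # coords: (n_voxels, 4) with the batch index first;
+                # feats: colors concatenated with centered xyz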
+ """ + if elastic_points is None: + coordinates, features = ME.utils.batch_sparse_collate( + [((p[:, :3] - p[:, :3].min(0)[0]) / self.voxel_size, + torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0)))) + for p in points]) + else: + coordinates, features = ME.utils.batch_sparse_collate( + [((el_p - el_p.min(0)[0]), + torch.hstack((p[:, 3:], p[:, :3] - p[:, :3].mean(0)))) + for el_p, p in zip(elastic_points, points)]) + + spatial_shape = torch.clip( + coordinates.max(0)[0][1:] + 1, self.min_spatial_shape) + field = ME.TensorField(features=features, coordinates=coordinates) + tensor = field.sparse() + coordinates = tensor.coordinates + features = tensor.features + inverse_mapping = field.inverse_mapping(tensor.coordinate_map_key) + + return coordinates, features, inverse_mapping, spatial_shape + + def _forward(*args, **kwargs): + """Implement abstract method of Base3DDetector.""" + pass + + def loss(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Calculate losses from a batch of inputs dict and data samples. + + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instances_3d` and `gt_sem_seg_3d`. + Returns: + dict: A dictionary of loss components. + """ + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points'], + batch_inputs_dict.get('elastic_coords', None)) + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + + x = self.extract_feat(x) + + scene_names = [] + for i in range(len(batch_data_samples)): + scene_names.append(batch_data_samples[i].lidar_path) + x = self.decoder(x, scene_names) + + sp_gt_instances = [] + for i in range(len(batch_data_samples)): + voxel_superpoints = inverse_mapping[ + coordinates[:, 0][inverse_mapping] == i] + voxel_superpoints = torch.unique( + voxel_superpoints, return_inverse=True)[1] + inst_mask = batch_data_samples[i].gt_pts_seg.pts_instance_mask + assert voxel_superpoints.shape == inst_mask.shape + + batch_data_samples[i].gt_instances_3d.sp_masks = \ + S3DISOneFormer3D.get_gt_inst_masks(inst_mask, voxel_superpoints) + sp_gt_instances.append(batch_data_samples[i].gt_instances_3d) + + loss = self.criterion(x, sp_gt_instances) + return loss + + def predict(self, batch_inputs_dict, batch_data_samples, **kwargs): + """Predict results from a batch of inputs and data samples with post- + processing. + Args: + batch_inputs_dict (dict): The model input dict which include + `points` key. + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It includes information such as + `gt_instance_3d` and `gt_sem_seg_3d`. + Returns: + list[:obj:`Det3DDataSample`]: Detection results of the + input samples. Each Det3DDataSample contains 'pred_pts_seg'. + And the `pred_pts_seg` contains following keys. + - instance_scores (Tensor): Classification scores, has a shape + (num_instance, ) + - instance_labels (Tensor): Labels of instances, has a shape + (num_instances, ) + - pts_instance_mask (Tensor): Instance mask, has a shape + (num_points, num_instances) of type bool. 
+ """ + + coordinates, features, inverse_mapping, spatial_shape = self.collate( + batch_inputs_dict['points']) + x = spconv.SparseConvTensor( + features, coordinates, spatial_shape, len(batch_data_samples)) + + x = self.extract_feat(x) + + scene_names = [] + for i in range(len(batch_data_samples)): + scene_names.append(batch_data_samples[i].lidar_path) + x = self.decoder(x, scene_names) + + results_list = self.predict_by_feat(x, inverse_mapping, scene_names) + + for i, data_sample in enumerate(batch_data_samples): + data_sample.pred_pts_seg = results_list[i] + return batch_data_samples + + def predict_by_feat(self, out, superpoints, scene_names): + """Predict instance masks for a single scene. + + Args: + out (Dict): Decoder output, each value is List of len 1. Keys: + `cls_preds` of shape (n_queries, n_instance_classes + 1), + `masks` of shape (n_queries, n_points), + `scores` of shape (n_queris, 1) or None. + superpoints (Tensor): of shape (n_raw_points,). + scene_names (List[string]): of len 1, which contain scene name. + + Returns: + List[PointData]: of len 1 with `pts_instance_mask`, + `instance_labels`, `instance_scores`. + """ + pred_labels = out['cls_preds'] + pred_masks = out['masks'] + pred_scores = out['scores'] + scene_name = scene_names[0] + + scores = F.softmax(pred_labels[0], dim=-1)[:, :-1] + scores *= pred_scores[0] + + if self.prefix_1dataset in scene_name: + labels = torch.arange( + self.num_classes_1dataset, + device=scores.device).unsqueeze(0).repeat( + self.decoder.num_queries_1dataset, + 1).flatten(0, 1) + elif self.prefix_2dataset in scene_name: + labels = torch.arange( + self.num_classes_2dataset, + device=scores.device).unsqueeze(0).repeat( + self.decoder.num_queries_2dataset, + 1).flatten(0, 1) + else: + raise RuntimeError(f'Invalid scene name "{scene_name}".') + + scores, topk_idx = scores.flatten(0, 1).topk( + self.test_cfg.topk_insts, sorted=False) + labels = labels[topk_idx] + + if self.prefix_1dataset in scene_name: + topk_idx = torch.div(topk_idx, self.num_classes_1dataset, + rounding_mode='floor') + elif self.prefix_2dataset in scene_name: + topk_idx = torch.div(topk_idx, self.num_classes_2dataset, + rounding_mode='floor') + else: + raise RuntimeError(f'Invalid scene name "{scene_name}".') + + mask_pred = pred_masks[0] + mask_pred = mask_pred[topk_idx] + mask_pred_sigmoid = mask_pred.sigmoid() + if self.test_cfg.get('obj_normalization', None): + mask_pred_thr = mask_pred_sigmoid > \ + self.test_cfg.obj_normalization_thr + mask_scores = (mask_pred_sigmoid * mask_pred_thr).sum(1) / \ + (mask_pred_thr.sum(1) + 1e-6) + scores = scores * mask_scores + + if self.test_cfg.get('nms', None): + kernel = self.test_cfg.matrix_nms_kernel + scores, labels, mask_pred_sigmoid, _ = mask_matrix_nms( + mask_pred_sigmoid, labels, scores, kernel=kernel) + + mask_pred = mask_pred_sigmoid > self.test_cfg.sp_score_thr + mask_pred = mask_pred[:, superpoints] + # score_thr + score_mask = scores > self.test_cfg.score_thr + scores = scores[score_mask] + labels = labels[score_mask] + mask_pred = mask_pred[score_mask] + + # npoint_thr + mask_pointnum = mask_pred.sum(1) + npoint_mask = mask_pointnum > self.test_cfg.npoint_thr + scores = scores[npoint_mask] + labels = labels[npoint_mask] + mask_pred = mask_pred[npoint_mask] + + return [ + PointData( + pts_instance_mask=mask_pred, + instance_labels=labels, + instance_scores=scores) + ] diff --git a/oneformer3d/query_decoder.py b/oneformer3d/query_decoder.py new file mode 100644 index 0000000..b0cb9ad --- /dev/null +++ 
b/oneformer3d/query_decoder.py @@ -0,0 +1,718 @@ +import torch +import torch.nn as nn + +from mmengine.model import BaseModule +from mmdet3d.registry import MODELS + + +class CrossAttentionLayer(BaseModule): + """Cross attention layer. + + Args: + d_model (int): Model dimension. + num_heads (int): Number of heads. + dropout (float): Dropout rate. + """ + + def __init__(self, d_model, num_heads, dropout, fix=False): + super().__init__() + self.fix = fix + self.attn = nn.MultiheadAttention( + d_model, num_heads, dropout=dropout, batch_first=True) + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + # todo: why BaseModule doesn't call it without us? + self.init_weights() + + def init_weights(self): + """Init weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, sources, queries, attn_masks=None): + """Forward pass. + + Args: + sources (List[Tensor]): of len batch_size, + each of shape (n_points_i, d_model). + queries (List[Tensor]): of len batch_size, + each of shape(n_queries_i, d_model). + attn_masks (List[Tensor] or None): of len batch_size, + each of shape (n_queries, n_points). + + Return: + List[Tensor]: Queries of len batch_size, + each of shape(n_queries_i, d_model). + """ + outputs = [] + for i in range(len(sources)): + k = v = sources[i] + attn_mask = attn_masks[i] if attn_masks is not None else None + output, _ = self.attn(queries[i], k, v, attn_mask=attn_mask) + if self.fix: + output = self.dropout(output) + output = output + queries[i] + if self.fix: + output = self.norm(output) + outputs.append(output) + return outputs + + +class SelfAttentionLayer(BaseModule): + """Self attention layer. + + Args: + d_model (int): Model dimension. + num_heads (int): Number of heads. + dropout (float): Dropout rate. + """ + + def __init__(self, d_model, num_heads, dropout): + super().__init__() + self.attn = nn.MultiheadAttention( + d_model, num_heads, dropout=dropout, batch_first=True) + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + """Forward pass. + + Args: + x (List[Tensor]): Queries of len batch_size, + each of shape(n_queries_i, d_model). + + Returns: + List[Tensor]: Queries of len batch_size, + each of shape(n_queries_i, d_model). + """ + out = [] + for y in x: + z, _ = self.attn(y, y, y) + z = self.dropout(z) + y + z = self.norm(z) + out.append(z) + return out + + +class FFN(BaseModule): + """Feed forward network. + + Args: + d_model (int): Model dimension. + hidden_dim (int): Hidden dimension. + dropout (float): Dropout rate. + activation_fn (str): 'relu' or 'gelu'. + """ + + def __init__(self, d_model, hidden_dim, dropout, activation_fn): + super().__init__() + self.net = nn.Sequential( + nn.Linear(d_model, hidden_dim), + nn.ReLU() if activation_fn == 'relu' else nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, d_model), + nn.Dropout(dropout)) + self.norm = nn.LayerNorm(d_model) + + def forward(self, x): + """Forward pass. + + Args: + x (List[Tensor]): Queries of len batch_size, + each of shape(n_queries_i, d_model). + + Returns: + List[Tensor]: Queries of len batch_size, + each of shape(n_queries_i, d_model). + """ + out = [] + for y in x: + z = self.net(y) + z = z + y + z = self.norm(z) + out.append(z) + return out + +@MODELS.register_module() +class QueryDecoder(BaseModule): + """Query decoder. + + Args: + num_layers (int): Number of transformer layers. + num_instance_queries (int): Number of instance queries. 
+ num_semantic_queries (int): Number of semantic queries. + num_classes (int): Number of classes. + in_channels (int): Number of input channels. + d_model (int): Number of channels for model layers. + num_heads (int): Number of head in attention layer. + hidden_dim (int): Dimension of attention layer. + dropout (float): Dropout rate for transformer layer. + activation_fn (str): 'relu' of 'gelu'. + iter_pred (bool): Whether to predict iteratively. + attn_mask (bool): Whether to use mask attention. + pos_enc_flag (bool): Whether to use positional enconding. + """ + + def __init__(self, num_layers, num_instance_queries, num_semantic_queries, + num_classes, in_channels, d_model, num_heads, hidden_dim, + dropout, activation_fn, iter_pred, attn_mask, fix_attention, + objectness_flag, **kwargs): + super().__init__() + self.objectness_flag = objectness_flag + self.input_proj = nn.Sequential( + nn.Linear(in_channels, d_model), nn.LayerNorm(d_model), nn.ReLU()) + self.num_queries = num_instance_queries + num_semantic_queries + if num_instance_queries + num_semantic_queries > 0: + self.query = nn.Embedding(num_instance_queries + num_semantic_queries, d_model) + if num_instance_queries == 0: + self.query_proj = nn.Sequential( + nn.Linear(in_channels, d_model), nn.ReLU(), + nn.Linear(d_model, d_model)) + self.cross_attn_layers = nn.ModuleList([]) + self.self_attn_layers = nn.ModuleList([]) + self.ffn_layers = nn.ModuleList([]) + for i in range(num_layers): + self.cross_attn_layers.append( + CrossAttentionLayer( + d_model, num_heads, dropout, fix_attention)) + self.self_attn_layers.append( + SelfAttentionLayer(d_model, num_heads, dropout)) + self.ffn_layers.append( + FFN(d_model, hidden_dim, dropout, activation_fn)) + self.out_norm = nn.LayerNorm(d_model) + self.out_cls = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), + nn.Linear(d_model, num_classes + 1)) + if objectness_flag: + self.out_score = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), nn.Linear(d_model, 1)) + self.x_mask = nn.Sequential( + nn.Linear(in_channels, d_model), nn.ReLU(), + nn.Linear(d_model, d_model)) + self.iter_pred = iter_pred + self.attn_mask = attn_mask + + def _get_queries(self, queries=None, batch_size=None): + """Get query tensor. + + Args: + queries (List[Tensor], optional): of len batch_size, + each of shape (n_queries_i, in_channels). + batch_size (int, optional): batch size. + + Returns: + List[Tensor]: of len batch_size, each of shape + (n_queries_i, d_model). + """ + if batch_size is None: + batch_size = len(queries) + + result_queries = [] + for i in range(batch_size): + result_query = [] + if hasattr(self, 'query'): + result_query.append(self.query.weight) + if queries is not None: + result_query.append(self.query_proj(queries[i])) + result_queries.append(torch.cat(result_query)) + return result_queries + + def _forward_head(self, queries, mask_feats): + """Prediction head forward. + + Args: + queries (List[Tensor] | Tensor): List of len batch_size, + each of shape (n_queries_i, d_model). Or tensor of + shape (batch_size, n_queries, d_model). + mask_feats (List[Tensor]): of len batch_size, + each of shape (n_points_i, d_model). + + Returns: + Tuple: + List[Tensor]: Classification predictions of len batch_size, + each of shape (n_queries_i, n_classes + 1). + List[Tensor]: Confidence scores of len batch_size, + each of shape (n_queries_i, 1). + List[Tensor]: Predicted masks of len batch_size, + each of shape (n_queries_i, n_points_i). 
+ List[Tensor] or None: Attention masks of len batch_size, + each of shape (n_queries_i, n_points_i). + """ + cls_preds, pred_scores, pred_masks, attn_masks = [], [], [], [] + for i in range(len(queries)): + norm_query = self.out_norm(queries[i]) + cls_preds.append(self.out_cls(norm_query)) + pred_score = self.out_score(norm_query) if self.objectness_flag \ + else None + pred_scores.append(pred_score) + pred_mask = torch.einsum('nd,md->nm', norm_query, mask_feats[i]) + if self.attn_mask: + attn_mask = (pred_mask.sigmoid() < 0.5).bool() + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + attn_mask = attn_mask.detach() + attn_masks.append(attn_mask) + pred_masks.append(pred_mask) + attn_masks = attn_masks if self.attn_mask else None + return cls_preds, pred_scores, pred_masks, attn_masks + + def forward_simple(self, x, queries): + """Simple forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + queries (List[Tensor], optional): of len batch_size, each of shape + (n_points_i, in_channles). + + Returns: + Dict: with labels, masks, and scores. + """ + inst_feats = [self.input_proj(y) for y in x] + mask_feats = [self.x_mask(y) for y in x] + queries = self._get_queries(queries, len(x)) + for i in range(len(self.cross_attn_layers)): + queries = self.cross_attn_layers[i](inst_feats, queries) + queries = self.self_attn_layers[i](queries) + queries = self.ffn_layers[i](queries) + cls_preds, pred_scores, pred_masks, _ = self._forward_head( + queries, mask_feats) + return dict( + cls_preds=cls_preds, + masks=pred_masks, + scores=pred_scores) + + def forward_iter_pred(self, x, queries): + """Iterative forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + queries (List[Tensor], optional): of len batch_size, each of shape + (n_points_i, in_channles). + + Returns: + Dict: with labels, masks, scores, and aux_outputs. + """ + cls_preds, pred_scores, pred_masks = [], [], [] + inst_feats = [self.input_proj(y) for y in x] + mask_feats = [self.x_mask(y) for y in x] + queries = self._get_queries(queries, len(x)) + cls_pred, pred_score, pred_mask, attn_mask = self._forward_head( + queries, mask_feats) + cls_preds.append(cls_pred) + pred_scores.append(pred_score) + pred_masks.append(pred_mask) + for i in range(len(self.cross_attn_layers)): + queries = self.cross_attn_layers[i](inst_feats, queries, attn_mask) + queries = self.self_attn_layers[i](queries) + queries = self.ffn_layers[i](queries) + cls_pred, pred_score, pred_mask, attn_mask = self._forward_head( + queries, mask_feats) + cls_preds.append(cls_pred) + pred_scores.append(pred_score) + pred_masks.append(pred_mask) + + aux_outputs = [ + {'cls_preds': cls_pred, 'masks': masks, 'scores': scores} + for cls_pred, scores, masks in zip( + cls_preds[:-1], pred_scores[:-1], pred_masks[:-1])] + return dict( + cls_preds=cls_preds[-1], + masks=pred_masks[-1], + scores=pred_scores[-1], + aux_outputs=aux_outputs) + + def forward(self, x, queries=None): + """Forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + queries (List[Tensor], optional): of len batch_size, each of shape + (n_points_i, in_channles). + + Returns: + Dict: with labels, masks, scores, and possibly aux_outputs. 
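+
+        Example:
+            A shape-level sketch; feature sizes are assumed, and
+            queries default to the learned embeddings when None:
+
+                x = [torch.rand(500, 32), torch.rand(700, 32)]
+                out = decoder(x)  # decoder: an instantiated QueryDecoder
+                # out['masks'][0]: (num_queries, 500)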
+ """ + if self.iter_pred: + return self.forward_iter_pred(x, queries) + else: + return self.forward_simple(x, queries) + + +@MODELS.register_module() +class ScanNetQueryDecoder(QueryDecoder): + """We simply add semantic prediction for each instance query. + """ + def __init__(self, num_instance_classes, num_semantic_classes, + d_model, num_semantic_linears, **kwargs): + super().__init__( + num_classes=num_instance_classes, d_model=d_model, **kwargs) + assert num_semantic_linears in [1, 2] + if num_semantic_linears == 2: + self.out_sem = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), + nn.Linear(d_model, num_semantic_classes + 1)) + else: + self.out_sem = nn.Linear(d_model, num_semantic_classes + 1) + + def _forward_head(self, queries, mask_feats, last_flag): + """Prediction head forward. + + Args: + queries (List[Tensor] | Tensor): List of len batch_size, + each of shape (n_queries_i, d_model). Or tensor of + shape (batch_size, n_queries, d_model). + mask_feats (List[Tensor]): of len batch_size, + each of shape (n_points_i, d_model). + + Returns: + Tuple: + List[Tensor]: Classification predictions of len batch_size, + each of shape (n_queries_i, n_instance_classes + 1). + List[Tensor] or None: Semantic predictions of len batch_size, + each of shape (n_queries_i, n_semantic_classes + 1). + List[Tensor]: Confidence scores of len batch_size, + each of shape (n_queries_i, 1). + List[Tensor]: Predicted masks of len batch_size, + each of shape (n_queries_i, n_points_i). + List[Tensor] or None: Attention masks of len batch_size, + each of shape (n_queries_i, n_points_i). + """ + cls_preds, sem_preds, pred_scores, pred_masks, attn_masks = \ + [], [], [], [], [] + for i in range(len(queries)): + norm_query = self.out_norm(queries[i]) + cls_preds.append(self.out_cls(norm_query)) + if last_flag: + sem_preds.append(self.out_sem(norm_query)) + pred_score = self.out_score(norm_query) if self.objectness_flag \ + else None + pred_scores.append(pred_score) + pred_mask = torch.einsum('nd,md->nm', norm_query, mask_feats[i]) + if self.attn_mask: + attn_mask = (pred_mask.sigmoid() < 0.5).bool() + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + attn_mask = attn_mask.detach() + attn_masks.append(attn_mask) + pred_masks.append(pred_mask) + attn_masks = attn_masks if self.attn_mask else None + sem_preds = sem_preds if last_flag else None + return cls_preds, sem_preds, pred_scores, pred_masks, attn_masks + + def forward_simple(self, x, queries): + """Simple forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + queries (List[Tensor], optional): of len batch_size, each of shape + (n_points_i, in_channles). + + Returns: + Dict: with instance scores, semantic scores, masks, and scores. + """ + inst_feats = [self.input_proj(y) for y in x] + mask_feats = [self.x_mask(y) for y in x] + queries = self._get_queries(queries, len(x)) + for i in range(len(self.cross_attn_layers)): + queries = self.cross_attn_layers[i](inst_feats, queries) + queries = self.self_attn_layers[i](queries) + queries = self.ffn_layers[i](queries) + cls_preds, sem_preds, pred_scores, pred_masks, _ = self._forward_head( + queries, mask_feats, last_flag=True) + return dict( + cls_preds=cls_preds, + sem_preds=sem_preds, + masks=pred_masks, + scores=pred_scores) + + def forward_iter_pred(self, x, queries): + """Iterative forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). 
+            queries (List[Tensor], optional): of len batch_size, each of shape
+                (n_points_i, in_channels).
+
+        Returns:
+            Dict: with instance scores, semantic scores, masks, scores,
+                and aux_outputs.
+        """
+        cls_preds, sem_preds, pred_scores, pred_masks = [], [], [], []
+        inst_feats = [self.input_proj(y) for y in x]
+        mask_feats = [self.x_mask(y) for y in x]
+        queries = self._get_queries(queries, len(x))
+        cls_pred, sem_pred, pred_score, pred_mask, attn_mask = \
+            self._forward_head(queries, mask_feats, last_flag=False)
+        cls_preds.append(cls_pred)
+        sem_preds.append(sem_pred)
+        pred_scores.append(pred_score)
+        pred_masks.append(pred_mask)
+        for i in range(len(self.cross_attn_layers)):
+            queries = self.cross_attn_layers[i](inst_feats, queries, attn_mask)
+            queries = self.self_attn_layers[i](queries)
+            queries = self.ffn_layers[i](queries)
+            last_flag = i == len(self.cross_attn_layers) - 1
+            cls_pred, sem_pred, pred_score, pred_mask, attn_mask = \
+                self._forward_head(queries, mask_feats, last_flag)
+            cls_preds.append(cls_pred)
+            sem_preds.append(sem_pred)
+            pred_scores.append(pred_score)
+            pred_masks.append(pred_mask)
+
+        aux_outputs = [
+            dict(
+                cls_preds=cls_pred,
+                sem_preds=sem_pred,
+                masks=masks,
+                scores=scores)
+            for cls_pred, sem_pred, scores, masks in zip(
+                cls_preds[:-1], sem_preds[:-1],
+                pred_scores[:-1], pred_masks[:-1])]
+        return dict(
+            cls_preds=cls_preds[-1],
+            sem_preds=sem_preds[-1],
+            masks=pred_masks[-1],
+            scores=pred_scores[-1],
+            aux_outputs=aux_outputs)
+
+
+@MODELS.register_module()
+class OneDataQueryDecoder(BaseModule):
+    """Query decoder. The same as above, but for two datasets.
+
+    Args:
+        num_layers (int): Number of transformer layers.
+        num_queries_1dataset (int): Number of queries for the first dataset.
+        num_queries_2dataset (int): Number of queries for the second dataset.
+        num_classes_1dataset (int): Number of classes in the first dataset.
+        num_classes_2dataset (int): Number of classes in the second dataset.
+        prefix_1dataset (string): Prefix for the first dataset.
+        prefix_2dataset (string): Prefix for the second dataset.
+        in_channels (int): Number of input channels.
+        d_model (int): Number of channels for model layers.
+        num_heads (int): Number of heads in the attention layer.
+        hidden_dim (int): Dimension of the attention layer.
+        dropout (float): Dropout rate for transformer layer.
+        activation_fn (str): 'relu' or 'gelu'.
+        iter_pred (bool): Whether to predict iteratively.
+        attn_mask (bool): Whether to use mask attention.
+        pos_enc_flag (bool): Whether to use positional encoding.
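+
+    Example:
+        An illustrative instantiation; all values below are assumptions
+        for the sketch, not the released configuration:
+
+            decoder = OneDataQueryDecoder(
+                num_layers=6, num_queries_1dataset=400,
+                num_queries_2dataset=400, num_classes_1dataset=18,
+                num_classes_2dataset=13, prefix_1dataset='scannet',
+                prefix_2dataset='s3dis', in_channels=32, d_model=256,
+                num_heads=8, hidden_dim=1024, dropout=0.0,
+                activation_fn='relu', iter_pred=True, attn_mask=True,
+                fix_attention=True)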
+ """ + + def __init__(self, + num_layers, + num_queries_1dataset, + num_queries_2dataset, + num_classes_1dataset, + num_classes_2dataset, + prefix_1dataset, + prefix_2dataset, + in_channels, + d_model, + num_heads, + hidden_dim, + dropout, + activation_fn, + iter_pred, + attn_mask, + fix_attention, + **kwargs): + super().__init__() + self.input_proj = nn.Sequential( + nn.Linear(in_channels, d_model), nn.LayerNorm(d_model), nn.ReLU()) + + self.num_queries_1dataset = num_queries_1dataset + self.num_queries_2dataset = num_queries_2dataset + + self.queries_1dataset = nn.Embedding(num_queries_1dataset, d_model) + self.queries_2dataset = nn.Embedding(num_queries_2dataset, d_model) + + self.prefix_1dataset = prefix_1dataset + self.prefix_2dataset = prefix_2dataset + + self.cross_attn_layers = nn.ModuleList([]) + self.self_attn_layers = nn.ModuleList([]) + self.ffn_layers = nn.ModuleList([]) + for i in range(num_layers): + self.cross_attn_layers.append( + CrossAttentionLayer( + d_model, num_heads, dropout, fix_attention)) + self.self_attn_layers.append( + SelfAttentionLayer(d_model, num_heads, dropout)) + self.ffn_layers.append( + FFN(d_model, hidden_dim, dropout, activation_fn)) + self.out_norm = nn.LayerNorm(d_model) + self.out_cls_1dataset = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), + nn.Linear(d_model, num_classes_1dataset + 1)) + self.out_cls_2dataset = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), + nn.Linear(d_model, num_classes_2dataset + 1)) + self.out_score = nn.Sequential( + nn.Linear(d_model, d_model), nn.ReLU(), nn.Linear(d_model, 1)) + self.x_mask = nn.Sequential( + nn.Linear(in_channels, d_model), nn.ReLU(), + nn.Linear(d_model, d_model)) + self.iter_pred = iter_pred + self.attn_mask = attn_mask + self.num_classes_1dataset = num_classes_1dataset + self.num_classes_2dataset = num_classes_2dataset + + def _get_queries(self, batch_size, scene_names): + """Get query tensor. + + Args: + batch_size (int, optional): batch size. + scene_names (List[string]): list of len batch size, which + contains scene names. + Returns: + List[Tensor]: of len batch_size, each of shape + (n_queries_i, d_model). + """ + + result_queries = [] + for i in range(batch_size): + if self.prefix_1dataset in scene_names[i]: + result_queries.append(self.queries_1dataset.weight) + elif self.prefix_2dataset in scene_names[i]: + result_queries.append(self.queries_2dataset.weight) + else: + raise RuntimeError(f'Invalid scene name "{scene_names[i]}".') + + return result_queries + + def _forward_head(self, queries, mask_feats, scene_names): + """Prediction head forward. + + Args: + queries (List[Tensor] | Tensor): List of len batch_size, + each of shape (n_queries_i, d_model). Or tensor of + shape (batch_size, n_queries, d_model). + mask_feats (List[Tensor]): of len batch_size, + each of shape (n_points_i, d_model). + scene_names (List[string]): list of len batch size, which + contains scene names. + + Returns: + Tuple: + List[Tensor]: Classification predictions of len batch_size, + each of shape (n_queries_i, n_classes + 1). + List[Tensor]: Confidence scores of len batch_size, + each of shape (n_queries_i, 1). + List[Tensor]: Predicted masks of len batch_size, + each of shape (n_queries_i, n_points_i). + List[Tensor]: Attention masks of len batch_size, + each of shape (n_queries_i, n_points_i). 
+ """ + cls_preds, pred_scores, pred_masks, attn_masks = [], [], [], [] + for i in range(len(queries)): + norm_query = self.out_norm(queries[i]) + + if self.prefix_1dataset in scene_names[i]: + cls_preds.append(self.out_cls_1dataset(norm_query)) + elif self.prefix_2dataset in scene_names[i]: + cls_preds.append(self.out_cls_2dataset(norm_query)) + else: + raise RuntimeError(f'Invalid scene name "{scene_names[i]}".') + + + pred_scores.append(self.out_score(norm_query)) + pred_mask = torch.einsum('nd,md->nm', norm_query, mask_feats[i]) + if self.attn_mask: + attn_mask = (pred_mask.sigmoid() < 0.5).bool() + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + attn_mask = attn_mask.detach() + attn_masks.append(attn_mask) + pred_masks.append(pred_mask) + return cls_preds, pred_scores, pred_masks, attn_masks + + def forward_simple(self, x, scene_names): + """Simple forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + scene_names (List[string]): list of len batch size, which + contains scene names. + + Returns: + Dict: with labels, masks, and scores. + """ + inst_feats = [self.input_proj(y) for y in x] + mask_feats = [self.x_mask(y) for y in x] + queries = self._get_queries(len(x), scene_names) + for i in range(len(self.cross_attn_layers)): + queries = self.cross_attn_layers[i](inst_feats, queries) + queries = self.self_attn_layers[i](queries) + queries = self.ffn_layers[i](queries) + cls_preds, pred_scores, pred_masks, _ = self._forward_head( + queries, mask_feats, scene_names) + return dict( + cls_preds=cls_preds, + masks=pred_masks, + scores=pred_scores) + + def forward_iter_pred(self, x, scene_names): + """Iterative forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + scene_names (List[string]): list of len batch size, which + contains scene names. + + Returns: + Dict: with labels, masks, scores, and aux_outputs. + """ + cls_preds, pred_scores, pred_masks = [], [], [] + inst_feats = [self.input_proj(y) for y in x] + mask_feats = [self.x_mask(y) for y in x] + queries = self._get_queries(len(x), scene_names) + cls_pred, pred_score, pred_mask, attn_mask = self._forward_head( + queries, mask_feats, scene_names) + cls_preds.append(cls_pred) + pred_scores.append(pred_score) + pred_masks.append(pred_mask) + for i in range(len(self.cross_attn_layers)): + queries = self.cross_attn_layers[i](inst_feats, queries, attn_mask) + queries = self.self_attn_layers[i](queries) + queries = self.ffn_layers[i](queries) + cls_pred, pred_score, pred_mask, attn_mask = self._forward_head( + queries, mask_feats, scene_names) + cls_preds.append(cls_pred) + pred_scores.append(pred_score) + pred_masks.append(pred_mask) + + aux_outputs = [ + {'cls_preds': cls_pred, 'masks': masks, 'scores': scores} + for cls_pred, scores, masks in zip( + cls_preds[:-1], pred_scores[:-1], pred_masks[:-1])] + return dict( + cls_preds=cls_preds[-1], + masks=pred_masks[-1], + scores=pred_scores[-1], + aux_outputs=aux_outputs) + + def forward(self, x, scene_names): + """Forward pass. + + Args: + x (List[Tensor]): of len batch_size, each of shape + (n_points_i, in_channels). + scene_names (List[string]): list of len batch size, which + contains scene names. + + Returns: + Dict: with labels, masks, scores, and possibly aux_outputs. 
+ """ + if self.iter_pred: + return self.forward_iter_pred(x, scene_names) + else: + return self.forward_simple(x, scene_names) diff --git a/oneformer3d/s3dis_dataset.py b/oneformer3d/s3dis_dataset.py new file mode 100644 index 0000000..7cc1ea4 --- /dev/null +++ b/oneformer3d/s3dis_dataset.py @@ -0,0 +1,19 @@ +from mmdet3d.registry import DATASETS +from mmdet3d.datasets.s3dis_dataset import S3DISDataset + + +@DATASETS.register_module() +class S3DISSegDataset_(S3DISDataset): + METAINFO = { + 'classes': + ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter'), + 'palette': [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0], + [255, 0, 255], [100, 100, 255], [200, 200, 100], + [170, 120, 200], [255, 0, 0], [200, 100, 100], + [10, 200, 100], [200, 200, 200], [50, 50, 50]], + 'seg_valid_class_ids': + tuple(range(13)), + 'seg_all_class_ids': + tuple(range(14)) # possibly with 'stair' class + } diff --git a/oneformer3d/scannet_dataset.py b/oneformer3d/scannet_dataset.py new file mode 100644 index 0000000..535062c --- /dev/null +++ b/oneformer3d/scannet_dataset.py @@ -0,0 +1,102 @@ +from os import path as osp +import numpy as np +import random + +from mmdet3d.datasets.scannet_dataset import ScanNetSegDataset +from mmdet3d.registry import DATASETS + + +@DATASETS.register_module() +class ScanNetSegDataset_(ScanNetSegDataset): + """We just add super_pts_path.""" + + def get_scene_idxs(self, *args, **kwargs): + """Compute scene_idxs for data sampling.""" + return np.arange(len(self)).astype(np.int32) + + def parse_data_info(self, info: dict) -> dict: + """Process the raw data info. + + Args: + info (dict): Raw info dict. + + Returns: + dict: Has `ann_info` in training stage. And + all path has been converted to absolute path. + """ + info['super_pts_path'] = osp.join( + self.data_prefix.get('sp_pts_mask', ''), info['super_pts_path']) + + info = super().parse_data_info(info) + + return info + + +@DATASETS.register_module() +class ScanNet200SegDataset_(ScanNetSegDataset_): + # IMPORTANT: the floor and chair categories are swapped. 
+ METAINFO = { + 'classes': ('wall', 'floor', 'chair', 'table', 'door', 'couch', 'cabinet', + 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', + 'picture', 'window', 'toilet', 'bookshelf', 'monitor', + 'curtain', 'book', 'armchair', 'coffee table', 'box', + 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', + 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', + 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', + 'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', + 'tv stand', 'whiteboard', 'blanket', 'shower curtain', + 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', + 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', + 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', + 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', + 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', + 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', + 'recycling bin', 'container', 'wardrobe', 'soap dispenser', + 'telephone', 'bucket', 'clock', 'stand', 'light', + 'laundry basket', 'pipe', 'clothes dryer', 'guitar', + 'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', + 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', + 'storage bin', 'coffee maker', 'dishwasher', + 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', + 'toaster', 'bulletin board', 'ironing board', 'fireplace', + 'soap dish', 'kitchen counter', 'doorframe', + 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', + 'ball', 'hat', 'shower curtain rod', 'water cooler', + 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', + 'toaster oven', 'mouse', 'toilet seat cover dispenser', + 'furniture', 'cart', 'storage container', 'scale', + 'tissue box', 'light switch', 'crate', 'power outlet', + 'decoration', 'sign', 'projector', 'closet door', + 'vacuum cleaner', 'candle', 'plunger', 'stuffed animal', + 'headphones', 'dish rack', 'broom', 'guitar case', + 'range hood', 'dustpan', 'hair dryer', 'water bottle', + 'handicap bar', 'purse', 'vent', 'shower floor', + 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock', + 'music stand', 'projector screen', 'divider', + 'laundry detergent', 'bathroom counter', 'object', + 'bathroom vanity', 'closet wall', 'laundry hamper', + 'bathroom stall door', 'ceiling light', 'trash bin', + 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', + 'cd case', 'closet rod', 'coffee kettle', 'structure', + 'shower head', 'keyboard piano', 'case of water bottles', + 'coat rack', 'storage organizer', 'folded chair', 'fire alarm', + 'power strip', 'calendar', 'poster', 'potted plant', 'luggage', + 'mattress'), + # the valid ids of segmentation annotations + 'seg_valid_class_ids': ( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, + 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, + 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, + 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, + 132, 134, 136, 138, 139, 140, 141, 145, 148, 154,155, 156, 157, 159, + 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, + 202, 208, 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, + 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, 392, 395, 399, + 408, 417, 488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, + 1164, 
1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, + 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, + 1189, 1190, 1191), + 'seg_all_class_ids': tuple(range(1, 1358)), + 'palette': [random.sample(range(0, 255), 3) for i in range(200)]} diff --git a/oneformer3d/semantic_criterion.py b/oneformer3d/semantic_criterion.py new file mode 100644 index 0000000..09f55d2 --- /dev/null +++ b/oneformer3d/semantic_criterion.py @@ -0,0 +1,116 @@ +import torch +import torch.nn.functional as F + +from mmdet3d.registry import MODELS + + +@MODELS.register_module() +class ScanNetSemanticCriterion: + """Semantic criterion for ScanNet. + + Args: + ignore_index (int): Ignore index. + loss_weight (float): Loss weight. + """ + + def __init__(self, ignore_index, loss_weight): + self.ignore_index = ignore_index + self.loss_weight = loss_weight + + def __call__(self, pred, insts): + """Calculate loss. + + Args: + pred (dict): Predictions with List `sem_preds` + of len batch_size, each of shape + (n_queries_i, n_classes + 1). + insts (list): Ground truth of len batch_size, + each InstanceData_ with `sp_masks` of shape + (n_classes + 1, n_queries_i). + + Returns: + Dict: with semantic loss value. + """ + losses = [] + for pred_mask, gt_mask in zip(pred['sem_preds'], insts): + if self.ignore_index >= 0: + pred_mask = pred_mask[:, :-1] + losses.append(F.cross_entropy( + pred_mask, + gt_mask.sp_masks.float().argmax(0), + ignore_index=self.ignore_index)) + loss = self.loss_weight * torch.mean(torch.stack(losses)) + return dict(seg_loss=loss) + + +@MODELS.register_module() +class S3DISSemanticCriterion: + """Semantic criterion for S3DIS. + + Args: + loss_weight (float): loss weight. + seg_loss (ConfigDict): loss config. + """ + + def __init__(self, + loss_weight, + seg_loss=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True)): + self.seg_loss = MODELS.build(seg_loss) + self.loss_weight = loss_weight + + def get_layer_loss(self, layer, aux_outputs, insts): + """Calculate loss at intermediate level. + + Args: + layer (int): transformer layer number + aux_outputs (dict): Predictions with List `masks` + of len batch_size, each of shape + (n_points_i, n_classes + 1). + insts (list): Ground truth of len batch_size, + each InstanceData_ with `sp_masks` of shape + (n_classes + 1, n_points_i). + + Returns: + Dict: with semantic loss value. + """ + pred_masks = aux_outputs['masks'] + seg_losses = [] + for pred_mask, gt_mask in zip(pred_masks, insts): + seg_loss = self.seg_loss( + pred_mask.T, gt_mask.sp_masks.float().argmax(0)) + seg_losses.append(seg_loss) + + seg_loss = self.loss_weight * torch.mean(torch.stack(seg_losses)) + return {f'layer_{layer}_seg_loss': seg_loss} + + def __call__(self, pred, insts): + """Calculate loss. + + Args: + pred (dict): Predictions with List `masks` + of len batch_size, each of shape + (n_points_i, n_classes + 1). + insts (list): Ground truth of len batch_size, + each InstanceData_ with `sp_masks` of shape + (n_classes + 1, n_points_i). + + Returns: + Dict: with semantic loss value. 
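+
+        Example:
+            A sketch of the expected prediction dict; sizes are
+            assumed, and `aux_outputs` is optional:
+
+                pred = dict(masks=[torch.rand(n_points, n_classes + 1)])
+                losses = criterion(pred, insts)
+                # keys: 'last_layer_seg_loss' and, when aux_outputs is
+                # present, 'layer_{i}_seg_loss' per decoder layer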
+ """ + pred_masks = pred['masks'] + seg_losses = [] + for pred_mask, gt_mask in zip(pred_masks, insts): + seg_loss = self.seg_loss( + pred_mask.T, gt_mask.sp_masks.float().argmax(0)) + seg_losses.append(seg_loss) + + seg_loss = self.loss_weight * torch.mean(torch.stack(seg_losses)) + loss = {'last_layer_seg_loss': seg_loss} + + if 'aux_outputs' in pred: + for i, aux_outputs in enumerate(pred['aux_outputs']): + loss_i = self.get_layer_loss(i, aux_outputs, insts) + loss.update(loss_i) + + return loss diff --git a/oneformer3d/spconv_unet.py b/oneformer3d/spconv_unet.py new file mode 100644 index 0000000..f81fcbf --- /dev/null +++ b/oneformer3d/spconv_unet.py @@ -0,0 +1,236 @@ +# Adapted from sunjiahao1999/SPFormer. +import functools +from collections import OrderedDict + +import spconv.pytorch as spconv +import torch +from spconv.pytorch.modules import SparseModule +from torch import nn + +from mmdet3d.registry import MODELS + + +class ResidualBlock(SparseModule): + """Resudual block for SpConv U-Net. + + Args: + in_channels (int): Number of input channels. + out_channels (int: Number of output channels. + norm_fn (Callable): Normalization function constructor. + indice_key (str): SpConv key for conv layer. + normalize_before (bool): Wheter to call norm before conv. + """ + + def __init__(self, + in_channels, + out_channels, + norm_fn=functools.partial( + nn.BatchNorm1d, eps=1e-4, momentum=0.1), + indice_key=None, + normalize_before=True): + super().__init__() + + if in_channels == out_channels: + self.i_branch = spconv.SparseSequential(nn.Identity()) + else: + self.i_branch = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, out_channels, kernel_size=1, bias=False)) + + if normalize_before: + self.conv_branch = spconv.SparseSequential( + norm_fn(in_channels), nn.ReLU(), + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key=indice_key), norm_fn(out_channels), nn.ReLU(), + spconv.SubMConv3d( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key=indice_key)) + else: + self.conv_branch = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key=indice_key), norm_fn(out_channels), nn.ReLU(), + spconv.SubMConv3d( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + indice_key=indice_key), norm_fn(out_channels), nn.ReLU()) + + def forward(self, input): + """Forward pass. + + Args: + input (SparseConvTensor): Input tensor. + + Returns: + SparseConvTensor: Output tensor. + """ + identity = spconv.SparseConvTensor(input.features, input.indices, + input.spatial_shape, + input.batch_size) + + output = self.conv_branch(input) + output = output.replace_feature(output.features + + self.i_branch(identity).features) + + return output + + +@MODELS.register_module() +class SpConvUNet(nn.Module): + """SpConv U-Net model. + + Args: + num_planes (List[int]): Number of channels in each level. + norm_fn (Callable): Normalization function constructor. + block_reps (int): Times to repeat each block. + block (Callable): Block base class. + indice_key_id (int): Id of current level. + normalize_before (bool): Wheter to call norm before conv. + return_blocks (bool): Whether to return previous blocks. 
+ """ + + def __init__(self, + num_planes, + norm_fn=functools.partial( + nn.BatchNorm1d, eps=1e-4, momentum=0.1), + block_reps=2, + block=ResidualBlock, + indice_key_id=1, + normalize_before=True, + return_blocks=False): + super().__init__() + self.return_blocks = return_blocks + self.num_planes = num_planes + + # process block and norm_fn caller + if isinstance(block, str): + area = ['residual', 'vgg', 'asym'] + assert block in area, f'block must be in {area}, but got {block}' + if block == 'residual': + block = ResidualBlock + + blocks = { + f'block{i}': block( + num_planes[0], + num_planes[0], + norm_fn, + normalize_before=normalize_before, + indice_key=f'subm{indice_key_id}') + for i in range(block_reps) + } + blocks = OrderedDict(blocks) + self.blocks = spconv.SparseSequential(blocks) + + if len(num_planes) > 1: + if normalize_before: + self.conv = spconv.SparseSequential( + norm_fn(num_planes[0]), nn.ReLU(), + spconv.SparseConv3d( + num_planes[0], + num_planes[1], + kernel_size=2, + stride=2, + bias=False, + indice_key=f'spconv{indice_key_id}')) + else: + self.conv = spconv.SparseSequential( + spconv.SparseConv3d( + num_planes[0], + num_planes[1], + kernel_size=2, + stride=2, + bias=False, + indice_key=f'spconv{indice_key_id}'), + norm_fn(num_planes[1]), nn.ReLU()) + + self.u = SpConvUNet( + num_planes[1:], + norm_fn, + block_reps, + block, + indice_key_id=indice_key_id + 1, + normalize_before=normalize_before, + return_blocks=return_blocks) + + if normalize_before: + self.deconv = spconv.SparseSequential( + norm_fn(num_planes[1]), nn.ReLU(), + spconv.SparseInverseConv3d( + num_planes[1], + num_planes[0], + kernel_size=2, + bias=False, + indice_key=f'spconv{indice_key_id}')) + else: + self.deconv = spconv.SparseSequential( + spconv.SparseInverseConv3d( + num_planes[1], + num_planes[0], + kernel_size=2, + bias=False, + indice_key=f'spconv{indice_key_id}'), + norm_fn(num_planes[0]), nn.ReLU()) + + blocks_tail = {} + for i in range(block_reps): + blocks_tail[f'block{i}'] = block( + num_planes[0] * (2 - i), + num_planes[0], + norm_fn, + indice_key=f'subm{indice_key_id}', + normalize_before=normalize_before) + blocks_tail = OrderedDict(blocks_tail) + self.blocks_tail = spconv.SparseSequential(blocks_tail) + + def forward(self, input, previous_outputs=None): + """Forward pass. + + Args: + input (SparseConvTensor): Input tensor. + previous_outputs (List[SparseConvTensor]): Previous imput tensors. + + Returns: + SparseConvTensor: Output tensor. 
+ """ + output = self.blocks(input) + identity = spconv.SparseConvTensor(output.features, output.indices, + output.spatial_shape, + output.batch_size) + + if len(self.num_planes) > 1: + output_decoder = self.conv(output) + if self.return_blocks: + output_decoder, previous_outputs = self.u( + output_decoder, previous_outputs) + else: + output_decoder = self.u(output_decoder) + output_decoder = self.deconv(output_decoder) + + output = output.replace_feature( + torch.cat((identity.features, output_decoder.features), dim=1)) + output = self.blocks_tail(output) + + if self.return_blocks: + # NOTE: to avoid the residual bug + if previous_outputs is None: + previous_outputs = [] + previous_outputs.append(output) + return output, previous_outputs + else: + return output diff --git a/oneformer3d/structured3d_dataset.py b/oneformer3d/structured3d_dataset.py new file mode 100644 index 0000000..c19cbb1 --- /dev/null +++ b/oneformer3d/structured3d_dataset.py @@ -0,0 +1,88 @@ +import numpy as np + +from mmengine.dataset.dataset_wrapper import ConcatDataset +from mmengine.dataset.base_dataset import BaseDataset +from mmdet3d.datasets.seg3d_dataset import Seg3DDataset +from mmdet3d.registry import DATASETS + + +@DATASETS.register_module() +class Structured3DSegDataset(Seg3DDataset): + METAINFO = { + 'classes': + ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', + 'window', 'picture', 'counter', 'desk', 'shelves', 'curtain', + 'dresser', 'pillow', 'mirror', 'ceiling', 'fridge', 'television', + 'night stand', 'toilet', 'sink', 'lamp', 'bathtub', 'structure', + 'furniture', 'prop'), + 'palette': [[135, 141, 249], [91, 186, 154], [134, 196, 138], + [205, 82, 150], [245, 38, 29], [238, 130,249], [189, 22, 4], + [128, 94, 103], [121, 74, 63], [98, 252, 9], [227, 8, 226], + [224, 58, 233], [244, 26, 146], [50, 62, 237], + [141, 30, 106], [60, 187, 63], [206, 106, 254], + [164, 85, 194], [187, 218, 244], [244, 140, 56], + [118, 8, 242], [88, 60, 134], [230, 110, 157], + [174, 48, 170], [3, 119, 80], [69, 148, 166], + [171, 16, 47], [81, 66, 251]], + 'seg_valid_class_ids': + (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 22, 24, 25, + 32, 33, 34, 35, 36, 38, 39, 40), + 'seg_all_class_ids': + tuple(range(41)), + } + + def get_scene_idxs(self, scene_idxs): + """Compute scene_idxs for data sampling. + + We sample more times for scenes with more points. + """ + return np.arange(len(self)).astype(np.int32) + + +@DATASETS.register_module() +class ConcatDataset_(ConcatDataset): + """A wrapper of concatenated dataset. + + Args: + datasets (Sequence[BaseDataset] or Sequence[dict]): A list of datasets + which will be concatenated. + lazy_init (bool, optional): Whether to load annotation during + instantiation. Defaults to False. + ignore_keys (List[str] or str): Ignore the keys that can be + unequal in `dataset.metainfo`. Defaults to None. 
+            `New in version 0.3.0.`
+    """
+
+    def __init__(self,
+                 datasets,
+                 lazy_init=False,
+                 ignore_keys=None):
+        self.datasets = []
+        for i, dataset in enumerate(datasets):
+            if isinstance(dataset, dict):
+                self.datasets.append(DATASETS.build(dataset))
+            elif isinstance(dataset, BaseDataset):
+                self.datasets.append(dataset)
+            else:
+                raise TypeError(
+                    'elements in datasets sequence should be config or '
+                    f'`BaseDataset` instance, but got {type(dataset)}')
+        if ignore_keys is None:
+            self.ignore_keys = []
+        elif isinstance(ignore_keys, str):
+            self.ignore_keys = [ignore_keys]
+        elif isinstance(ignore_keys, list):
+            self.ignore_keys = ignore_keys
+        else:
+            raise TypeError('ignore_keys should be a list or str, '
+                            f'but got {type(ignore_keys)}')
+
+        meta_keys: set = set()
+        for dataset in self.datasets:
+            meta_keys |= dataset.metainfo.keys()
+        # Only use metainfo of first dataset.
+        self._metainfo = self.datasets[0].metainfo
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
diff --git a/oneformer3d/structures.py b/oneformer3d/structures.py
new file mode 100644
index 0000000..2037bb5
--- /dev/null
+++ b/oneformer3d/structures.py
@@ -0,0 +1,25 @@
+from collections.abc import Sized
+from mmengine.structures import InstanceData
+
+
+class InstanceData_(InstanceData):
+    """We only remove a single assert from __setattr__."""
+
+    def __setattr__(self, name: str, value: Sized):
+        """setattr is only used to set data.
+
+        The value must have the attribute of `__len__` and have the same
+        length as the `InstanceData`.
+        """
+        if name in ('_metainfo_fields', '_data_fields'):
+            if not hasattr(self, name):
+                super(InstanceData, self).__setattr__(name, value)
+            else:
+                raise AttributeError(f'{name} has been used as a '
+                                     'private attribute, which is immutable.')
+
+        else:
+            assert isinstance(value,
+                              Sized), 'value must contain `__len__` attribute'
+
+            super(InstanceData, self).__setattr__(name, value)
diff --git a/oneformer3d/transforms_3d.py b/oneformer3d/transforms_3d.py
new file mode 100644
index 0000000..242d306
--- /dev/null
+++ b/oneformer3d/transforms_3d.py
@@ -0,0 +1,408 @@
+import numpy as np
+import scipy
+import torch
+from torch_scatter import scatter_mean
+from mmcv.transforms import BaseTransform
+from mmdet3d.datasets.transforms import PointSample
+
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class ElasticTransfrom(BaseTransform):
+    """Apply elastic augmentation to a 3D scene.
+
+    Required Keys:
+    - points (:obj:`BasePoints`)
+
+    Args:
+        gran (List[float]): Size of the noise grid (in same scale[m/cm]
+            as the voxel grid).
+        mag (List[float]): Noise multiplier.
+        voxel_size (float): Voxel size.
+        p (float): Probability of applying this transform.
+    """
+
+    def __init__(self, gran, mag, voxel_size, p=1.0):
+        self.gran = gran
+        self.mag = mag
+        self.voxel_size = voxel_size
+        self.p = p
+
+    def transform(self, input_dict):
+        """Private function-wrapper for elastic transform.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after elastic, 'elastic_coords' is added
+                to the result dict.
+        """
+        coords = input_dict['points'].tensor[:, :3].numpy() / self.voxel_size
+        if np.random.rand() < self.p:
+            coords = self.elastic(coords, self.gran[0], self.mag[0])
+            coords = self.elastic(coords, self.gran[1], self.mag[1])
+        input_dict['elastic_coords'] = coords
+        return input_dict
+
+    def elastic(self, x, gran, mag):
+        """Apply elastic distortion to point coordinates.
+
+        Args:
+            x (ndarray): Point coordinates.
+            gran (float): Size of the noise grid (in same scale[m/cm]
+                as the voxel grid).
+            mag (float): Noise multiplier.
+
+        Returns:
+            ndarray: Deformed point coordinates.
+        """
+        blur0 = np.ones((3, 1, 1)).astype('float32') / 3
+        blur1 = np.ones((1, 3, 1)).astype('float32') / 3
+        blur2 = np.ones((1, 1, 3)).astype('float32') / 3
+
+        noise_dim = np.abs(x).max(0).astype(np.int32) // gran + 3
+        noise = [
+            np.random.randn(noise_dim[0], noise_dim[1],
+                            noise_dim[2]).astype('float32') for _ in range(3)
+        ]
+
+        for blur in [blur0, blur1, blur2, blur0, blur1, blur2]:
+            noise = [
+                scipy.ndimage.filters.convolve(
+                    n, blur, mode='constant', cval=0) for n in noise
+            ]
+
+        ax = [
+            np.linspace(-(b - 1) * gran, (b - 1) * gran, b) for b in noise_dim
+        ]
+        interp = [
+            scipy.interpolate.RegularGridInterpolator(
+                ax, n, bounds_error=0, fill_value=0) for n in noise
+        ]
+
+        return x + np.hstack([i(x)[:, None] for i in interp]) * mag
+
+
+@TRANSFORMS.register_module()
+class AddSuperPointAnnotations(BaseTransform):
+    """Prepare ground truth markup for training.
+
+    Required Keys:
+    - pts_semantic_mask (np.float32)
+    - pts_instance_mask (np.float32)
+    - sp_pts_mask (np.int64)
+
+    Added Keys:
+    - gt_sp_masks (np.int64)
+    - gt_labels_3d (int)
+
+    Args:
+        num_classes (int): Number of classes.
+        stuff_classes (List[int]): Ids of stuff classes.
+        merge_non_stuff_cls (bool): Whether to merge all thing classes
+            into a single one.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 stuff_classes,
+                 merge_non_stuff_cls=True):
+        self.num_classes = num_classes
+        self.stuff_classes = stuff_classes
+        self.merge_non_stuff_cls = merge_non_stuff_cls
+
+    def transform(self, input_dict):
+        """Private function for preparing ground truth
+        markup for training.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: results, 'gt_sp_masks' is added.
+        """
+        # create class mapping
+        # because pts_instance_mask contains instances from non-instance
+        # classes
+        pts_instance_mask = torch.tensor(input_dict['pts_instance_mask'])
+        pts_semantic_mask = torch.tensor(input_dict['pts_semantic_mask'])
+
+        pts_instance_mask[pts_semantic_mask == self.num_classes] = -1
+        for stuff_cls in self.stuff_classes:
+            pts_instance_mask[pts_semantic_mask == stuff_cls] = -1
+
+        idxs = torch.unique(pts_instance_mask)
+        assert idxs[0] == -1
+
+        mapping = torch.zeros(torch.max(idxs) + 2, dtype=torch.long)
+        new_idxs = torch.arange(len(idxs), device=idxs.device)
+        mapping[idxs] = new_idxs - 1
+        pts_instance_mask = mapping[pts_instance_mask]
+        input_dict['pts_instance_mask'] = pts_instance_mask.numpy()
+
+        # create gt instance markup
+        insts_mask = pts_instance_mask.clone()
+
+        if torch.sum(insts_mask == -1) != 0:
+            insts_mask[insts_mask == -1] = torch.max(insts_mask) + 1
+            insts_mask = torch.nn.functional.one_hot(insts_mask)[:, :-1]
+        else:
+            insts_mask = torch.nn.functional.one_hot(insts_mask)
+
+        if insts_mask.shape[1] != 0:
+            insts_mask = insts_mask.T
+            sp_pts_mask = torch.tensor(input_dict['sp_pts_mask'])
+            sp_masks_inst = scatter_mean(
+                insts_mask.float(), sp_pts_mask, dim=-1)
+            sp_masks_inst = sp_masks_inst > 0.5
+        else:
+            sp_masks_inst = insts_mask.new_zeros(
+                (0, input_dict['sp_pts_mask'].max() + 1), dtype=torch.bool)
+
+        num_stuff_cls = len(self.stuff_classes)
+        insts = new_idxs[1:] - 1
+        if self.merge_non_stuff_cls:
+            gt_labels = insts.new_zeros(len(insts) + num_stuff_cls + 1)
+        else:
+            gt_labels = insts.new_zeros(len(insts) + self.num_classes + 1)
+
+        for inst in insts:
+            index = pts_semantic_mask[pts_instance_mask == inst][0]
+            gt_labels[inst] = index - num_stuff_cls
+
+        input_dict['gt_labels_3d'] = gt_labels.numpy()
+
+        # create gt semantic markup
+        sem_mask = torch.tensor(input_dict['pts_semantic_mask'])
+        sem_mask = torch.nn.functional.one_hot(sem_mask,
+                                    num_classes=self.num_classes + 1)
+
+        sem_mask = sem_mask.T
+        sp_pts_mask = torch.tensor(input_dict['sp_pts_mask'])
+        sp_masks_seg = scatter_mean(sem_mask.float(), sp_pts_mask, dim=-1)
+        sp_masks_seg = sp_masks_seg > 0.5
+
+        sp_masks_seg[-1, sp_masks_seg.sum(axis=0) == 0] = True
+
+        assert sp_masks_seg.sum(axis=0).max().item()
+
+        if self.merge_non_stuff_cls:
+            sp_masks_seg = torch.vstack((
+                sp_masks_seg[:num_stuff_cls, :],
+                sp_masks_seg[num_stuff_cls:, :].sum(axis=0).unsqueeze(0)))
+
+        sp_masks_all = torch.vstack((sp_masks_inst, sp_masks_seg))
+
+        input_dict['gt_sp_masks'] = sp_masks_all.numpy()
+
+        # create eval markup
+        if 'eval_ann_info' in input_dict.keys():
+            pts_instance_mask[pts_instance_mask != -1] += num_stuff_cls
+            for idx, stuff_cls in enumerate(self.stuff_classes):
+                pts_instance_mask[pts_semantic_mask == stuff_cls] = idx
+
+            input_dict['eval_ann_info']['pts_instance_mask'] = \
+                pts_instance_mask.numpy()
+
+        return input_dict
+
+
+@TRANSFORMS.register_module()
+class SwapChairAndFloor(BaseTransform):
+    """Swap two categories for ScanNet200 dataset. It is convenient for
+    panoptic evaluation. After this swap, the first two categories are
+    `stuff` and the other 198 are `thing`.
+    """
+
+    def transform(self, input_dict):
+        """Private function-wrapper for swap transform.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after swap, 'pts_semantic_mask' is updated
+                in the result dict.
+        """
+        mask = input_dict['pts_semantic_mask'].copy()
+        mask[input_dict['pts_semantic_mask'] == 2] = 3
+        mask[input_dict['pts_semantic_mask'] == 3] = 2
+        input_dict['pts_semantic_mask'] = mask
+        if 'eval_ann_info' in input_dict:
+            input_dict['eval_ann_info']['pts_semantic_mask'] = mask
+        return input_dict
+
+
+@TRANSFORMS.register_module()
+class PointInstClassMapping_(BaseTransform):
+    """Delete instances from non-instance classes.
+
+    Required Keys:
+    - pts_instance_mask (np.float32)
+    - pts_semantic_mask (np.float32)
+
+    Modified Keys:
+    - pts_instance_mask (np.float32)
+    - pts_semantic_mask (np.float32)
+
+    Added Keys:
+    - gt_labels_3d (int)
+
+    Args:
+        num_classes (int): Number of classes.
+        structured3d (bool): Whether data comes from Structured3D.
+    """
+
+    def __init__(self, num_classes, structured3d=False):
+        self.num_classes = num_classes
+        self.structured3d = structured3d
+
+    def transform(self, input_dict):
+        """Private function for deleting
+        instances from non-instance classes.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: results, 'pts_instance_mask', 'pts_semantic_mask',
+                are updated in the result dict. 'gt_labels_3d' is added.
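+
+        Example (a toy sketch; with `num_classes=2`, semantic id 2 marks
+        unlabeled points, so its instance is dropped):
+
+            >>> t = PointInstClassMapping_(num_classes=2)
+            >>> out = t.transform({
+            ...     'pts_instance_mask': np.array([0, 0, 1, 2]),
+            ...     'pts_semantic_mask': np.array([0, 0, 1, 2])})
+            >>> out['gt_labels_3d']
+            array([0, 1])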
+ """ + + # because pts_instance_mask contains instances from non-instaces + # classes + pts_instance_mask = np.array(input_dict['pts_instance_mask']) + pts_semantic_mask = input_dict['pts_semantic_mask'] + + if self.structured3d: + # wall as one instance + pts_instance_mask[pts_semantic_mask == 0] = \ + pts_instance_mask.max() + 1 + # floor as one instance + pts_instance_mask[pts_semantic_mask == 1] = \ + pts_instance_mask.max() + 1 + + pts_instance_mask[pts_semantic_mask == self.num_classes] = -1 + pts_semantic_mask[pts_semantic_mask == self.num_classes] = -1 + + idxs = np.unique(pts_instance_mask) + mapping = np.zeros(np.max(idxs) + 2, dtype=int) + new_idxs = np.arange(len(idxs)) + if idxs[0] == -1: + mapping[idxs] = new_idxs - 1 + new_idxs = new_idxs[:-1] + else: + mapping[idxs] = new_idxs + pts_instance_mask = mapping[pts_instance_mask] + + input_dict['pts_instance_mask'] = pts_instance_mask + input_dict['pts_semantic_mask'] = pts_semantic_mask + + gt_labels = np.zeros(len(new_idxs), dtype=int) + for inst in new_idxs: + gt_labels[inst] = pts_semantic_mask[pts_instance_mask == inst][0] + + input_dict['gt_labels_3d'] = gt_labels + + return input_dict + + +@TRANSFORMS.register_module() +class PointSample_(PointSample): + + def _points_random_sampling(self, points, num_samples): + """Points random sampling. Sample points to a certain number. + + Args: + points (:obj:`BasePoints`): 3D Points. + num_samples (int): Number of samples to be sampled. + + Returns: + tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: + - points (:obj:`BasePoints`): 3D Points. + - choices (np.ndarray, optional): The generated random samples. + """ + + point_range = range(len(points)) + choices = np.random.choice(point_range, + min(num_samples, len(points))) + + return points[choices], choices + + def transform(self, input_dict): + """Transform function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask', + 'pts_semantic_mask', sp_pts_mask' keys are updated in the + result dict. + """ + points = input_dict['points'] + points, choices = self._points_random_sampling( + points, self.num_points) + input_dict['points'] = points + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + sp_pts_mask = input_dict.get('sp_pts_mask', None) + + if pts_instance_mask is not None: + pts_instance_mask = pts_instance_mask[choices] + + idxs = np.unique(pts_instance_mask) + mapping = np.zeros(np.max(idxs) + 2, dtype=int) + new_idxs = np.arange(len(idxs)) + if idxs[0] == -1: + mapping[idxs] = new_idxs - 1 + else: + mapping[idxs] = new_idxs + pts_instance_mask = mapping[pts_instance_mask] + + input_dict['pts_instance_mask'] = pts_instance_mask + + if pts_semantic_mask is not None: + pts_semantic_mask = pts_semantic_mask[choices] + input_dict['pts_semantic_mask'] = pts_semantic_mask + + if sp_pts_mask is not None: + sp_pts_mask = sp_pts_mask[choices] + sp_pts_mask = np.unique( + sp_pts_mask, return_inverse=True)[1] + input_dict['sp_pts_mask'] = sp_pts_mask + return input_dict + +@TRANSFORMS.register_module() +class SkipEmptyScene(BaseTransform): + """Skip empty scene during training. 
+
+    Required Keys:
+    - pts_instance_mask (np.float32)
+    - pts_semantic_mask (np.float32)
+    - points (:obj:`BasePoints`)
+    - gt_labels_3d (int)
+
+    Modified Keys:
+    - pts_instance_mask (np.float32)
+    - pts_semantic_mask (np.float32)
+    - points (:obj:`BasePoints`)
+    - gt_labels_3d (int)
+    """
+
+    def transform(self, input_dict):
+        """Private function for skipping empty scene during training.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: results, 'pts_instance_mask', 'pts_semantic_mask',
+                'points', 'gt_labels_3d' are updated in the result dict.
+        """
+        if len(input_dict['gt_labels_3d']) != 0:
+            self.inst = input_dict['pts_instance_mask']
+            self.sem = input_dict['pts_semantic_mask']
+            self.gt_labels = input_dict['gt_labels_3d']
+            self.points = input_dict['points']
+        else:
+            input_dict['pts_instance_mask'] = self.inst
+            input_dict['pts_semantic_mask'] = self.sem
+            input_dict['gt_labels_3d'] = self.gt_labels
+            input_dict['points'] = self.points
+
+        return input_dict
diff --git a/oneformer3d/unified_criterion.py b/oneformer3d/unified_criterion.py
new file mode 100644
index 0000000..da4a718
--- /dev/null
+++ b/oneformer3d/unified_criterion.py
@@ -0,0 +1,161 @@
+from mmdet3d.registry import MODELS
+from .structures import InstanceData_
+
+
+@MODELS.register_module()
+class ScanNetUnifiedCriterion:
+    """Simply call semantic and instance criterions.
+
+    Args:
+        num_semantic_classes (int): Number of semantic classes.
+        sem_criterion (ConfigDict): Class for semantic loss calculation.
+        inst_criterion (ConfigDict): Class for instance loss calculation.
+    """
+
+    def __init__(self, num_semantic_classes, sem_criterion, inst_criterion):
+        self.num_semantic_classes = num_semantic_classes
+        self.sem_criterion = MODELS.build(sem_criterion)
+        self.inst_criterion = MODELS.build(inst_criterion)
+
+    def __call__(self, pred, insts):
+        """Calculate loss.
+
+        Args:
+            pred (Dict):
+                List `cls_preds` of len batch_size, each of shape
+                    (n_queries, n_classes + 1)
+                List `scores` of len batch_size each of shape (n_queries, 1)
+                List `masks` of len batch_size each of shape
+                    (n_queries, n_points)
+                Dict `aux_outputs` with list of cls_preds, scores, and masks
+                List `sem_preds` of len batch_size each of shape
+                    (n_queries, n_classes + 1).
+            insts (list): Ground truth of len batch_size,
+                each InstanceData_ with
+                    `sp_masks` of shape (n_gts_i + n_classes + 1, n_points_i)
+                    `labels_3d` of shape (n_gts_i + n_classes + 1,)
+                    `query_masks` of shape
+                        (n_gts_i + n_classes + 1, n_queries_i).
+
+        Returns:
+            Dict: with semantic and instance loss values.
+        """
+        sem_gts = []
+        inst_gts = []
+        n = self.num_semantic_classes
+
+        for i in range(len(pred['masks'])):
+            sem_gt = InstanceData_()
+            if insts[i].get('query_masks') is not None:
+                sem_gt.sp_masks = insts[i].query_masks[-n - 1:, :]
+            else:
+                sem_gt.sp_masks = insts[i].sp_masks[-n - 1:, :]
+            sem_gts.append(sem_gt)
+
+            inst_gt = InstanceData_()
+            inst_gt.sp_masks = insts[i].sp_masks[:-n - 1, :]
+            inst_gt.labels_3d = insts[i].labels_3d[:-n - 1]
+            if insts[i].get('query_masks') is not None:
+                inst_gt.query_masks = insts[i].query_masks[:-n - 1, :]
+            inst_gts.append(inst_gt)
+
+        loss = self.inst_criterion(pred, inst_gts)
+        loss.update(self.sem_criterion(pred, sem_gts))
+        return loss
+
+
+@MODELS.register_module()
+class S3DISUnifiedCriterion:
+    """Simply call semantic and instance criterions.
+
+    Args:
+        num_semantic_classes (int): Number of semantic classes.
+        sem_criterion (ConfigDict): Class for semantic loss calculation.
+        inst_criterion (ConfigDict): Class for instance loss calculation.
+    """
+
+    def __init__(self, num_semantic_classes, sem_criterion, inst_criterion):
+        self.num_semantic_classes = num_semantic_classes
+        self.sem_criterion = MODELS.build(sem_criterion)
+        self.inst_criterion = MODELS.build(inst_criterion)
+
+    def __call__(self, pred, insts):
+        """Calculate loss.
+
+        Args:
+            pred (Dict):
+                List `cls_preds` of len batch_size, each of shape
+                    (n_queries, n_classes + 1)
+                List `scores` of len batch_size each of shape (n_queries, 1)
+                List `masks` of len batch_size each of shape
+                    (n_queries, n_points)
+                Dict `aux_outputs` with list of cls_preds, scores, and masks
+            insts (list): Ground truth of len batch_size,
+                each InstanceData_ with
+                    `sp_inst_masks` of shape
+                        (n_gts_i, n_points_i)
+                    `sp_sem_masks` of shape
+                        (n_classes + 1, n_points_i)
+                    `labels_3d` of shape (n_gts_i + n_classes + 1,).
+
+        Returns:
+            Dict: with semantic and instance loss values.
+        """
+        pred_masks = pred['masks']
+        pred_cls = pred['cls_preds']
+        pred_scores = pred['scores']
+
+        sem_preds = []
+        sem_gts = []
+        inst_gts = []
+        n = self.num_semantic_classes
+        for i in range(len(pred_masks)):
+            sem_preds.append(pred_masks[i][-n:, :])
+            pred_masks[i] = pred_masks[i][:-n, :]
+            pred_cls[i] = pred_cls[i][:-n, :]
+            pred_scores[i] = pred_scores[i][:-n, :]
+
+            sem_gt = InstanceData_()
+            inst_gt = InstanceData_()
+            sem_gt.sp_masks = insts[i].sp_sem_masks
+            sem_gts.append(sem_gt)
+            inst_gt.sp_masks = insts[i].sp_inst_masks
+            inst_gt.labels_3d = insts[i].labels_3d
+            inst_gts.append(inst_gt)
+
+        sem_aux_outputs = []
+        if 'aux_outputs' in pred:
+            for aux_outputs in pred['aux_outputs']:
+                sem_aux_outputs.append(self.prepare_aux_outputs(aux_outputs))
+
+        loss = self.inst_criterion(pred, inst_gts)
+        loss.update(self.sem_criterion(
+            {'masks': sem_preds, 'aux_outputs': sem_aux_outputs}, sem_gts))
+        return loss
+
+    def prepare_aux_outputs(self, aux_outputs):
+        """Prepare aux outputs for intermediate layers.
+
+        Args:
+            aux_outputs (Dict):
+                List `cls_preds` of len batch_size, each of shape
+                    (n_queries, n_classes + 1)
+                List `scores` of len batch_size each of shape (n_queries, 1)
+                List `masks` of len batch_size each of shape
+                    (n_queries, n_points).
+
+        Returns:
+            Dict: with semantic predictions.
+        """
+        pred_masks = aux_outputs['masks']
+        pred_cls = aux_outputs['cls_preds']
+        pred_scores = aux_outputs['scores']
+
+        sem_preds = []
+        n = self.num_semantic_classes
+        for i in range(len(pred_masks)):
+            sem_preds.append(pred_masks[i][-n:, :])
+            pred_masks[i] = pred_masks[i][:-n, :]
+            pred_cls[i] = pred_cls[i][:-n, :]
+            pred_scores[i] = pred_scores[i][:-n, :]
+
+        return {'masks': sem_preds}
diff --git a/oneformer3d/unified_metric.py b/oneformer3d/unified_metric.py
new file mode 100644
index 0000000..d6e526c
--- /dev/null
+++ b/oneformer3d/unified_metric.py
@@ -0,0 +1,255 @@
+import torch
+import numpy as np
+
+from mmengine.logging import MMLogger
+
+from mmdet3d.evaluation import InstanceSegMetric
+from mmdet3d.evaluation.metrics import SegMetric
+from mmdet3d.registry import METRICS
+from mmdet3d.evaluation import panoptic_seg_eval, seg_eval
+from .instance_seg_eval import instance_seg_eval
+
+
+@METRICS.register_module()
+class UnifiedSegMetric(SegMetric):
+    """Metric for instance, semantic, and panoptic evaluation.
+    The order of classes must be [stuff classes, thing classes, unlabeled].
+
+    Args:
+        thing_class_inds (List[int]): Ids of thing classes.
+        stuff_class_inds (List[int]): Ids of stuff classes.
+ min_num_points (int): Minimal size of mask for panoptic segmentation. + id_offset (int): Offset for instance classes. + sem_mapping (List[int]): Semantic class to gt id. + inst_mapping (List[int]): Instance class to gt id. + metric_meta (Dict): Analogue of dataset meta of SegMetric. Keys: + `label2cat` (Dict[int, str]): class names, + `ignore_index` (List[int]): ids of semantic categories to ignore, + `classes` (List[str]): class names. + logger_keys (List[Tuple]): Keys for logger to save; of len 3: + semantic, instance, and panoptic. + """ + + def __init__(self, + thing_class_inds, + stuff_class_inds, + min_num_points, + id_offset, + sem_mapping, + inst_mapping, + metric_meta, + logger_keys=[('miou',), + ('all_ap', 'all_ap_50%', 'all_ap_25%'), + ('pq',)], + **kwargs): + self.thing_class_inds = thing_class_inds + self.stuff_class_inds = stuff_class_inds + self.min_num_points = min_num_points + self.id_offset = id_offset + self.metric_meta = metric_meta + self.logger_keys = logger_keys + self.sem_mapping = np.array(sem_mapping) + self.inst_mapping = np.array(inst_mapping) + super().__init__(**kwargs) + + def compute_metrics(self, results): + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + self.valid_class_ids = self.dataset_meta['seg_valid_class_ids'] + label2cat = self.metric_meta['label2cat'] + ignore_index = self.metric_meta['ignore_index'] + classes = self.metric_meta['classes'] + thing_classes = [classes[i] for i in self.thing_class_inds] + stuff_classes = [classes[i] for i in self.stuff_class_inds] + num_stuff_cls = len(stuff_classes) + + gt_semantic_masks_inst_task = [] + gt_instance_masks_inst_task = [] + pred_instance_masks_inst_task = [] + pred_instance_labels = [] + pred_instance_scores = [] + + gt_semantic_masks_sem_task = [] + pred_semantic_masks_sem_task = [] + + gt_masks_pan = [] + pred_masks_pan = [] + + for eval_ann, single_pred_results in results: + + if self.metric_meta['dataset_name'] == 'S3DIS': + pan_gt = {} + pan_gt['pts_semantic_mask'] = eval_ann['pts_semantic_mask'] + pan_gt['pts_instance_mask'] = \ + eval_ann['pts_instance_mask'].copy() + + for stuff_cls in self.stuff_class_inds: + pan_gt['pts_instance_mask'][\ + pan_gt['pts_semantic_mask'] == stuff_cls] = \ + np.max(pan_gt['pts_instance_mask']) + 1 + + pan_gt['pts_instance_mask'] = np.unique( + pan_gt['pts_instance_mask'], + return_inverse=True)[1] + gt_masks_pan.append(pan_gt) + else: + gt_masks_pan.append(eval_ann) + + pred_masks_pan.append({ + 'pts_instance_mask': \ + single_pred_results['pts_instance_mask'][1], + 'pts_semantic_mask': \ + single_pred_results['pts_semantic_mask'][1] + }) + + gt_semantic_masks_sem_task.append(eval_ann['pts_semantic_mask']) + pred_semantic_masks_sem_task.append( + single_pred_results['pts_semantic_mask'][0]) + + if self.metric_meta['dataset_name'] == 'S3DIS': + gt_semantic_masks_inst_task.append(eval_ann['pts_semantic_mask']) + gt_instance_masks_inst_task.append(eval_ann['pts_instance_mask']) + else: + sem_mask, inst_mask = self.map_inst_markup( + eval_ann['pts_semantic_mask'].copy(), + eval_ann['pts_instance_mask'].copy(), + self.valid_class_ids[num_stuff_cls:], + num_stuff_cls) + gt_semantic_masks_inst_task.append(sem_mask) + gt_instance_masks_inst_task.append(inst_mask) + + pred_instance_masks_inst_task.append( + 
torch.tensor(single_pred_results['pts_instance_mask'][0]))
+            pred_instance_labels.append(
+                torch.tensor(single_pred_results['instance_labels']))
+            pred_instance_scores.append(
+                torch.tensor(single_pred_results['instance_scores']))
+
+        ret_pan = panoptic_seg_eval(
+            gt_masks_pan, pred_masks_pan, classes, thing_classes,
+            stuff_classes, self.min_num_points, self.id_offset,
+            label2cat, ignore_index, logger)
+
+        ret_sem = seg_eval(
+            gt_semantic_masks_sem_task,
+            pred_semantic_masks_sem_task,
+            label2cat,
+            ignore_index[0],
+            logger=logger)
+
+        if self.metric_meta['dataset_name'] == 'S3DIS':
+            # :-1 for unlabeled
+            ret_inst = instance_seg_eval(
+                gt_semantic_masks_inst_task,
+                gt_instance_masks_inst_task,
+                pred_instance_masks_inst_task,
+                pred_instance_labels,
+                pred_instance_scores,
+                valid_class_ids=self.valid_class_ids,
+                class_labels=classes[:-1],
+                logger=logger)
+        else:
+            # :-1 for unlabeled
+            ret_inst = instance_seg_eval(
+                gt_semantic_masks_inst_task,
+                gt_instance_masks_inst_task,
+                pred_instance_masks_inst_task,
+                pred_instance_labels,
+                pred_instance_scores,
+                valid_class_ids=self.valid_class_ids[num_stuff_cls:],
+                class_labels=classes[num_stuff_cls:-1],
+                logger=logger)
+
+        metrics = dict()
+        for ret, keys in zip((ret_sem, ret_inst, ret_pan), self.logger_keys):
+            for key in keys:
+                metrics[key] = ret[key]
+        return metrics
+
+    def map_inst_markup(self,
+                        pts_semantic_mask,
+                        pts_instance_mask,
+                        valid_class_ids,
+                        num_stuff_cls):
+        """Map gt instance and semantic classes back from panoptic
+        annotations.
+
+        Args:
+            pts_semantic_mask (np.array): of shape (n_raw_points,)
+            pts_instance_mask (np.array): of shape (n_raw_points,)
+            valid_class_ids (Tuple): of len n_instance_classes
+            num_stuff_cls (int): number of stuff classes
+
+        Returns:
+            Tuple:
+                np.array: pts_semantic_mask of shape (n_raw_points,)
+                np.array: pts_instance_mask of shape (n_raw_points,)
+        """
+        pts_instance_mask -= num_stuff_cls
+        pts_instance_mask[pts_instance_mask < 0] = -1
+        pts_semantic_mask -= num_stuff_cls
+        pts_semantic_mask[pts_instance_mask == -1] = -1
+
+        mapping = np.array(list(valid_class_ids) + [-1])
+        pts_semantic_mask = mapping[pts_semantic_mask]
+
+        return pts_semantic_mask, pts_instance_mask
+
+
+@METRICS.register_module()
+class InstanceSegMetric_(InstanceSegMetric):
+    """The only difference with InstanceSegMetric is that following ScanNet
+    evaluator we accept instance prediction as a boolean tensor of shape
+    (n_points, n_instances) instead of integer tensor of shape (n_points,).
+
+    For this purpose we only replace the instance_seg_eval call.
+    """
+
+    def compute_metrics(self, results):
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+                the metrics, and the values are corresponding results.
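+
+        Note:
+            Here each predicted `pts_instance_mask` is a boolean tensor of
+            shape (n_points, n_instances), as described in the class
+            docstring above.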
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + self.classes = self.dataset_meta['classes'] + self.valid_class_ids = self.dataset_meta['seg_valid_class_ids'] + + gt_semantic_masks = [] + gt_instance_masks = [] + pred_instance_masks = [] + pred_instance_labels = [] + pred_instance_scores = [] + + for eval_ann, single_pred_results in results: + gt_semantic_masks.append(eval_ann['pts_semantic_mask']) + gt_instance_masks.append(eval_ann['pts_instance_mask']) + pred_instance_masks.append( + single_pred_results['pts_instance_mask']) + pred_instance_labels.append(single_pred_results['instance_labels']) + pred_instance_scores.append(single_pred_results['instance_scores']) + + ret_dict = instance_seg_eval( + gt_semantic_masks, + gt_instance_masks, + pred_instance_masks, + pred_instance_labels, + pred_instance_scores, + valid_class_ids=self.valid_class_ids, + class_labels=self.classes, + logger=logger) + + return ret_dict diff --git a/tools/create_data.py b/tools/create_data.py new file mode 100644 index 0000000..88658d6 --- /dev/null +++ b/tools/create_data.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from os import path as osp + +from indoor_converter import create_indoor_info_file +from update_infos_to_v2 import update_pkl_infos + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + info_train_path = osp.join(out_dir, f'{info_prefix}_oneformer3d_infos_train.pkl') + info_val_path = osp.join(out_dir, f'{info_prefix}_oneformer3d_infos_val.pkl') + info_test_path = osp.join(out_dir, f'{info_prefix}_oneformer3d_infos_test.pkl') + update_pkl_infos(info_prefix, out_dir=out_dir, pkl_path=info_train_path) + update_pkl_infos(info_prefix, out_dir=out_dir, pkl_path=info_val_path) + update_pkl_infos(info_prefix, out_dir=out_dir, pkl_path=info_test_path) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required=False, + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + from mmdet3d.utils import register_all_modules + register_all_modules() + + if args.dataset in ('scannet', 'scannet200'): + scannet_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + else: + raise NotImplementedError(f'Don\'t support {args.dataset} dataset.') diff --git a/tools/fix_spconv_checkpoint.py b/tools/fix_spconv_checkpoint.py new file mode 100644 index 0000000..b838aaa --- /dev/null +++ b/tools/fix_spconv_checkpoint.py @@ -0,0 +1,18 @@ +import argparse +import torch + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--in-path', type=str, required=True) + parser.add_argument('--out-path', type=str, required=True) + args = 
parser.parse_args() + + checkpoint = torch.load(args.in_path) + key = 'state_dict' # 'model' for SSTNet + for layer in checkpoint[key]: + if (layer.startswith('unet') or layer.startswith('input_conv')) \ + and layer.endswith('weight') \ + and len(checkpoint[key][layer].shape) == 5: + checkpoint[key][layer] = checkpoint[key][layer].permute(1, 2, 3, 4, 0) + torch.save(checkpoint, args.out_path) diff --git a/tools/indoor_converter.py b/tools/indoor_converter.py new file mode 100644 index 0000000..1fc6f74 --- /dev/null +++ b/tools/indoor_converter.py @@ -0,0 +1,67 @@ +# Modified from mmdetection3d/tools/dataset_converters/indoor_converter.py +# We just support ScanNet 200. +import os + +import mmengine + +from scannet_data_utils import ScanNetData + + +def create_indoor_info_file(data_path, + pkl_prefix='sunrgbd', + save_path=None, + use_v1=False, + workers=4): + """Create indoor information file. + + Get information of the raw data and save it to the pkl file. + + Args: + data_path (str): Path of the data. + pkl_prefix (str, optional): Prefix of the pkl to be saved. + Default: 'sunrgbd'. + save_path (str, optional): Path of the pkl to be saved. Default: None. + use_v1 (bool, optional): Whether to use v1. Default: False. + workers (int, optional): Number of threads to be used. Default: 4. + """ + assert os.path.exists(data_path) + assert pkl_prefix in ['scannet', 'scannet200'], \ + f'unsupported indoor dataset {pkl_prefix}' + save_path = data_path if save_path is None else save_path + assert os.path.exists(save_path) + + # generate infos for both detection and segmentation task + train_filename = os.path.join( + save_path, f'{pkl_prefix}_oneformer3d_infos_train.pkl') + val_filename = os.path.join( + save_path, f'{pkl_prefix}_oneformer3d_infos_val.pkl') + test_filename = os.path.join( + save_path, f'{pkl_prefix}_oneformer3d_infos_test.pkl') + if pkl_prefix == 'scannet': + # ScanNet has a train-val-test split + train_dataset = ScanNetData(root_path=data_path, split='train') + val_dataset = ScanNetData(root_path=data_path, split='val') + test_dataset = ScanNetData(root_path=data_path, split='test') + else: # ScanNet200 + # ScanNet has a train-val-test split + train_dataset = ScanNetData(root_path=data_path, split='train', + scannet200=True, save_path=save_path) + val_dataset = ScanNetData(root_path=data_path, split='val', + scannet200=True, save_path=save_path) + test_dataset = ScanNetData(root_path=data_path, split='test', + scannet200=True, save_path=save_path) + + infos_train = train_dataset.get_infos( + num_workers=workers, has_label=True) + mmengine.dump(infos_train, train_filename, 'pkl') + print(f'{pkl_prefix} info train file is saved to {train_filename}') + + infos_val = val_dataset.get_infos( + num_workers=workers, has_label=True) + mmengine.dump(infos_val, val_filename, 'pkl') + print(f'{pkl_prefix} info val file is saved to {val_filename}') + + infos_test = test_dataset.get_infos( + num_workers=workers, has_label=False) + mmengine.dump(infos_test, test_filename, 'pkl') + print(f'{pkl_prefix} info test file is saved to {test_filename}') diff --git a/tools/scannet_data_utils.py b/tools/scannet_data_utils.py new file mode 100644 index 0000000..942b527 --- /dev/null +++ b/tools/scannet_data_utils.py @@ -0,0 +1,281 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from concurrent import futures as futures +from os import path as osp + +import mmengine +import numpy as np + + +class ScanNetData(object): + """ScanNet data. + Generate scannet infos for scannet_converter. 
+ + Args: + root_path (str): Root path of the raw data. + split (str, optional): Set split type of the data. Default: 'train'. + scannet200 (bool): True for ScanNet200, else for ScanNet. + save_path (str, optional): Output directory. + """ + + def __init__(self, root_path, split='train', scannet200=False, save_path=None): + self.root_dir = root_path + self.save_path = root_path if save_path is None else save_path + self.split = split + self.split_dir = osp.join(root_path) + self.scannet200 = scannet200 + if self.scannet200: + self.classes = [ + 'chair', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', + 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', + 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', + 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', + 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', + 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', + 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', + 'backpack', 'toilet paper', 'printer', 'tv stand', + 'whiteboard', 'blanket', 'shower curtain', 'trash can', + 'closet', 'stairs', 'microwave', 'stove', 'shoe', + 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', + 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', + 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', + 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', + 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', + 'recycling bin', 'container', 'wardrobe', 'soap dispenser', + 'telephone', 'bucket', 'clock', 'stand', 'light', + 'laundry basket', 'pipe', 'clothes dryer', 'guitar', + 'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', + 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', + 'storage bin', 'coffee maker', 'dishwasher', + 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', + 'toaster', 'bulletin board', 'ironing board', 'fireplace', + 'soap dish', 'kitchen counter', 'doorframe', + 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', + 'ball', 'hat', 'shower curtain rod', 'water cooler', + 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', + 'toaster oven', 'mouse', 'toilet seat cover dispenser', + 'furniture', 'cart', 'storage container', 'scale', + 'tissue box', 'light switch', 'crate', 'power outlet', + 'decoration', 'sign', 'projector', 'closet door', + 'vacuum cleaner', 'candle', 'plunger', 'stuffed animal', + 'headphones', 'dish rack', 'broom', 'guitar case', + 'range hood', 'dustpan', 'hair dryer', 'water bottle', + 'handicap bar', 'purse', 'vent', 'shower floor', + 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock', + 'music stand', 'projector screen', 'divider', + 'laundry detergent', 'bathroom counter', 'object', + 'bathroom vanity', 'closet wall', 'laundry hamper', + 'bathroom stall door', 'ceiling light', 'trash bin', + 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', + 'cd case', 'closet rod', 'coffee kettle', 'structure', + 'shower head', 'keyboard piano', 'case of water bottles', + 'coat rack', 'storage organizer', 'folded chair', 'fire alarm', + 'power strip', 'calendar', 'poster', 'potted plant', 'luggage', + 'mattress' + ] + self.cat_ids = np.array([ + 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, + 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, + 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, + 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, + 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, + 99, 100, 
101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, + 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, + 139, 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, + 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, + 202, 208, 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, + 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, + 370, 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, + 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, + 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, + 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, + 1191 + ]) + else: + self.classes = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin' + ] + self.cat_ids = np.array([ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39 + ]) + + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = {self.cat2label[t]: t for t in self.cat2label} + self.cat_ids2class = { + nyu40id: i + for i, nyu40id in enumerate(list(self.cat_ids)) + } + assert split in ['train', 'val', 'test'] + split_file = osp.join(self.root_dir, 'meta_data', + f'scannetv2_{split}.txt') + mmengine.check_file_exist(split_file) + self.sample_id_list = mmengine.list_from_file(split_file) + self.test_mode = (split == 'test') + + def __len__(self): + return len(self.sample_id_list) + + def get_aligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_aligned_bbox.npy') + mmengine.check_file_exist(box_file) + return np.load(box_file) + + def get_unaligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_unaligned_bbox.npy') + mmengine.check_file_exist(box_file) + return np.load(box_file) + + def get_axis_align_matrix(self, idx): + matrix_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_axis_align_matrix.npy') + mmengine.check_file_exist(matrix_file) + return np.load(matrix_file) + + def get_images(self, idx): + paths = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if file.endswith('.jpg'): + paths.append(osp.join('posed_images', idx, file)) + return paths + + def get_extrinsics(self, idx): + extrinsics = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if file.endswith('.txt') and not file == 'intrinsic.txt': + extrinsics.append(np.loadtxt(osp.join(path, file))) + return extrinsics + + def get_intrinsics(self, idx): + matrix_file = osp.join(self.root_dir, 'posed_images', idx, + 'intrinsic.txt') + mmengine.check_file_exist(matrix_file) + return np.loadtxt(matrix_file) + + def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): + """Get data infos. + + This method gets information from the raw data. + + Args: + num_workers (int, optional): Number of threads to be used. + Default: 4. + has_label (bool, optional): Whether the data has label. + Default: True. + sample_id_list (list[int], optional): Index list of the sample. + Default: None. + + Returns: + infos (list[dict]): Information of the raw data. 
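+
+        Example (an illustrative sketch; the paths and scene id are
+        hypothetical and depend on the extracted ScanNet data):
+
+            >>> data = ScanNetData(root_path='./data/scannet')
+            >>> infos = data.get_infos(num_workers=4, has_label=True)
+            >>> infos[0]['pts_path']  # e.g. 'points/scene0000_00.bin'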
+ """ + + def process_single_scene(sample_idx): + print(f'{self.split} sample_idx: {sample_idx}') + info = dict() + pc_info = {'num_features': 6, 'lidar_idx': sample_idx} + info['point_cloud'] = pc_info + pts_filename = osp.join(self.root_dir, 'scannet_instance_data', + f'{sample_idx}_vert.npy') + points = np.load(pts_filename) + mmengine.mkdir_or_exist(osp.join(self.save_path, 'points')) + points.tofile( + osp.join(self.save_path, 'points', f'{sample_idx}.bin')) + info['pts_path'] = osp.join('points', f'{sample_idx}.bin') + + sp_filename = osp.join(self.root_dir, 'scannet_instance_data', + f'{sample_idx}_sp_label.npy') + super_points = np.load(sp_filename) + mmengine.mkdir_or_exist(osp.join(self.save_path, 'super_points')) + super_points.tofile( + osp.join(self.save_path, 'super_points', f'{sample_idx}.bin')) + info['super_pts_path'] = osp.join('super_points', f'{sample_idx}.bin') + + # update with RGB image paths if exist + if os.path.exists(osp.join(self.root_dir, 'posed_images')): + info['intrinsics'] = self.get_intrinsics(sample_idx) + all_extrinsics = self.get_extrinsics(sample_idx) + all_img_paths = self.get_images(sample_idx) + # some poses in ScanNet are invalid + extrinsics, img_paths = [], [] + for extrinsic, img_path in zip(all_extrinsics, all_img_paths): + if np.all(np.isfinite(extrinsic)): + img_paths.append(img_path) + extrinsics.append(extrinsic) + info['extrinsics'] = extrinsics + info['img_paths'] = img_paths + + if not self.test_mode: + pts_instance_mask_path = osp.join( + self.root_dir, 'scannet_instance_data', + f'{sample_idx}_ins_label.npy') + pts_semantic_mask_path = osp.join( + self.root_dir, 'scannet_instance_data', + f'{sample_idx}_sem_label.npy') + + pts_instance_mask = np.load(pts_instance_mask_path).astype( + np.int64) + pts_semantic_mask = np.load(pts_semantic_mask_path).astype( + np.int64) + + mmengine.mkdir_or_exist( + osp.join(self.save_path, 'instance_mask')) + mmengine.mkdir_or_exist( + osp.join(self.save_path, 'semantic_mask')) + + pts_instance_mask.tofile( + osp.join(self.save_path, 'instance_mask', + f'{sample_idx}.bin')) + pts_semantic_mask.tofile( + osp.join(self.save_path, 'semantic_mask', + f'{sample_idx}.bin')) + + info['pts_instance_mask_path'] = osp.join( + 'instance_mask', f'{sample_idx}.bin') + info['pts_semantic_mask_path'] = osp.join( + 'semantic_mask', f'{sample_idx}.bin') + + if has_label: + annotations = {} + # box is of shape [k, 6 + class] + aligned_box_label = self.get_aligned_box_label(sample_idx) + unaligned_box_label = self.get_unaligned_box_label(sample_idx) + annotations['gt_num'] = aligned_box_label.shape[0] + if annotations['gt_num'] != 0: + aligned_box = aligned_box_label[:, :-1] # k, 6 + unaligned_box = unaligned_box_label[:, :-1] + classes = aligned_box_label[:, -1] # k + annotations['name'] = np.array([ + self.label2cat[self.cat_ids2class[classes[i]]] + for i in range(annotations['gt_num']) + ]) + # default names are given to aligned bbox for compatibility + # we also save unaligned bbox info with marked names + annotations['location'] = aligned_box[:, :3] + annotations['dimensions'] = aligned_box[:, 3:6] + annotations['gt_boxes_upright_depth'] = aligned_box + annotations['unaligned_location'] = unaligned_box[:, :3] + annotations['unaligned_dimensions'] = unaligned_box[:, 3:6] + annotations[ + 'unaligned_gt_boxes_upright_depth'] = unaligned_box + annotations['index'] = np.arange( + annotations['gt_num'], dtype=np.int32) + annotations['class'] = np.array([ + self.cat_ids2class[classes[i]] + for i in 
range(annotations['gt_num']) + ]) + axis_align_matrix = self.get_axis_align_matrix(sample_idx) + annotations['axis_align_matrix'] = axis_align_matrix # 4x4 + info['annos'] = annotations + return info + + sample_id_list = sample_id_list if sample_id_list is not None \ + else self.sample_id_list + with futures.ThreadPoolExecutor(num_workers) as executor: + infos = executor.map(process_single_scene, sample_id_list) + return list(infos) diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000..cee82d8 --- /dev/null +++ b/tools/test.py @@ -0,0 +1,149 @@ +# This is an exact copy of tools/test.py from open-mmlab/mmdetection3d. +import argparse +import os +import os.path as osp + +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.registry import RUNNERS +from mmengine.runner import Runner + +from mmdet3d.utils import replace_ceph_backend + + +# TODO: support fuse_conv_bn and format_only +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--ceph', action='store_true', help='Use ceph as data storage backend') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. ' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--score-thr', type=float, default=0.1, help='bbox score threshold') + parser.add_argument( + '--task', + type=str, + choices=[ + 'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg', + 'multi-modality_det' + ], + help='Determine the visualization method depending on the task.') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument( + '--tta', action='store_true', help='Test time augmentation') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/test.py` instead + # of `--local_rank`. 
+ parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def trigger_visualization_hook(cfg, args): + default_hooks = cfg.default_hooks + if 'visualization' in default_hooks: + visualization_hook = default_hooks['visualization'] + # Turn on visualization + visualization_hook['draw'] = True + if args.show: + visualization_hook['show'] = True + visualization_hook['wait_time'] = args.wait_time + if args.show_dir: + visualization_hook['test_out_dir'] = args.show_dir + all_task_choices = [ + 'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg', + 'multi-modality_det' + ] + assert args.task in all_task_choices, 'You must set '\ + f"'--task' in {all_task_choices} in the command " \ + 'if you want to use visualization hook' + visualization_hook['vis_task'] = args.task + visualization_hook['score_thr'] = args.score_thr + else: + raise RuntimeError( + 'VisualizationHook must be included in default_hooks.' + 'refer to usage ' + '"visualization=dict(type=\'VisualizationHook\')"') + + return cfg + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + + # TODO: We will unify the ceph support approach with other OpenMMLab repos + if args.ceph: + cfg = replace_ceph_backend(cfg) + + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if args.show or args.show_dir: + cfg = trigger_visualization_hook(cfg, args) + + if args.tta: + # Currently, we only support tta for 3D segmentation + # TODO: Support tta for 3D detection + assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' + assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` in config.' + cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline + cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start testing + runner.test() + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000..dd904ed --- /dev/null +++ b/tools/train.py @@ -0,0 +1,135 @@ +# This is an exact copy of tools/train.py from open-mmlab/mmdetection3d. 
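+#
+# Example usage (the config path below is illustrative):
+#   python tools/train.py configs/oneformer3d_1xb4_scannet.py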
+import argparse +import logging +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.registry import RUNNERS +from mmengine.runner import Runner + +from mmdet3d.utils import replace_ceph_backend + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a 3D detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='enable automatically scaling LR.') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--ceph', action='store_true', help='Use ceph as data storage backend') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. 
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+
+    # TODO: We will unify the ceph support approach with other OpenMMLab repos
+    if args.ceph:
+        cfg = replace_ceph_backend(cfg)
+
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    # enable automatic-mixed-precision training
+    if args.amp is True:
+        optim_wrapper = cfg.optim_wrapper.type
+        if optim_wrapper == 'AmpOptimWrapper':
+            print_log(
+                'AMP training is already enabled in your config.',
+                logger='current',
+                level=logging.WARNING)
+        else:
+            assert optim_wrapper == 'OptimWrapper', (
+                '`--amp` is only supported when the optimizer wrapper type '
+                f'is `OptimWrapper` but got {optim_wrapper}.')
+            cfg.optim_wrapper.type = 'AmpOptimWrapper'
+            cfg.optim_wrapper.loss_scale = 'dynamic'
+
+    # enable automatically scaling LR
+    if args.auto_scale_lr:
+        if 'auto_scale_lr' in cfg and \
+                'enable' in cfg.auto_scale_lr and \
+                'base_batch_size' in cfg.auto_scale_lr:
+            cfg.auto_scale_lr.enable = True
+        else:
+            raise RuntimeError('Cannot find "auto_scale_lr" or '
+                               '"auto_scale_lr.enable" or '
+                               '"auto_scale_lr.base_batch_size" in your'
+                               ' configuration file.')
+
+    # resume is determined in this priority: resume from > auto_resume
+    if args.resume == 'auto':
+        cfg.resume = True
+        cfg.load_from = None
+    elif args.resume is not None:
+        cfg.resume = True
+        cfg.load_from = args.resume
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # start training
+    runner.train()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/update_infos_to_v2.py b/tools/update_infos_to_v2.py
new file mode 100644
index 0000000..00722d7
--- /dev/null
+++ b/tools/update_infos_to_v2.py
@@ -0,0 +1,417 @@
+# Modified from mmdetection3d/tools/dataset_converters/update_infos_to_v2.py
+"""Convert the annotation pkl to the standard format in OpenMMLab V2.0.
+
+Example:
+    python tools/update_infos_to_v2.py
+        --dataset scannet
+        --pkl-path ./data/scannet/scannet_infos_train.pkl
+        --out-dir ./scannet_v2/
+"""
+
+import argparse
+import time
+from os import path as osp
+from pathlib import Path
+
+import mmengine
+
+
+def get_empty_instance():
+    """Empty annotation for a single instance."""
+    instance = dict(
+        # (list[float], required): list of 4 numbers representing
+        # the bounding box of the instance, in (x1, y1, x2, y2) order.
+        bbox=None,
+        # (int, required): an integer in the range
+        # [0, num_categories - 1] representing the category label.
+        bbox_label=None,
+        # (list[float], optional): list of 7 (or 9) numbers representing
+        # the 3D bounding box of the instance,
+        # in [x, y, z, w, h, l, yaw]
+        # (or [x, y, z, w, h, l, yaw, vx, vy]) order.
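+        # (For the axis-aligned ScanNet boxes converted below, this is
+        # expected to be 6 numbers, x, y, z, dx, dy, dz, with no yaw.)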
+        bbox_3d=None,
+        # (bool, optional): Whether to use the
+        # 3D bounding box during training.
+        bbox_3d_isvalid=None,
+        # (int, optional): 3D category label
+        # (typically the same as label).
+        bbox_label_3d=None,
+        # (float, optional): Projected center depth of the
+        # 3D bounding box compared to the image plane.
+        depth=None,
+        # (list[float], optional): Projected
+        # 2D center of the 3D bounding box.
+        center_2d=None,
+        # (int, optional): Attribute labels
+        # (fine-grained labels such as stopping, moving, ignore, crowd).
+        attr_label=None,
+        # (int, optional): The number of LiDAR
+        # points in the 3D bounding box.
+        num_lidar_pts=None,
+        # (int, optional): The number of Radar
+        # points in the 3D bounding box.
+        num_radar_pts=None,
+        # (int, optional): Difficulty level of
+        # detecting the 3D bounding box.
+        difficulty=None,
+        unaligned_bbox_3d=None)
+    return instance
+
+
+def get_empty_lidar_points():
+    lidar_points = dict(
+        # (int, optional): Number of features for each point.
+        num_pts_feats=None,
+        # (str, optional): Path of the LiDAR data file.
+        lidar_path=None,
+        # (list[list[float]], optional): Transformation matrix
+        # from lidar to ego-vehicle
+        # with shape [4, 4].
+        # (Referenced camera coordinate system is ego in KITTI.)
+        lidar2ego=None,
+    )
+    return lidar_points
+
+
+def get_empty_radar_points():
+    radar_points = dict(
+        # (int, optional): Number of features for each point.
+        num_pts_feats=None,
+        # (str, optional): Path of the RADAR data file.
+        radar_path=None,
+        # Transformation matrix from radar to
+        # ego-vehicle with shape [4, 4].
+        # (Referenced camera coordinate system is ego in KITTI.)
+        radar2ego=None,
+    )
+    return radar_points
+
+
+def get_empty_img_info():
+    img_info = dict(
+        # (str, required): the path to the image file.
+        img_path=None,
+        # (int): The height of the image.
+        height=None,
+        # (int): The width of the image.
+        width=None,
+        # (str, optional): Path of the depth map file.
+        depth_map=None,
+        # (list[list[float]], optional): Transformation
+        # matrix from camera to image with
+        # shape [3, 3], [3, 4] or [4, 4].
+        cam2img=None,
+        # (list[list[float]]): Transformation matrix from lidar
+        # or depth to image with shape [4, 4].
+        lidar2img=None,
+        # (list[list[float]], optional): Transformation
+        # matrix from camera to ego-vehicle
+        # with shape [4, 4].
+        cam2ego=None)
+    return img_info
+
+
+def get_single_image_sweep(camera_types):
+    single_image_sweep = dict(
+        # (float, optional): Timestamp of the current frame.
+        timestamp=None,
+        # (list[list[float]], optional): Transformation matrix
+        # from ego-vehicle to the global frame.
+        ego2global=None)
+    # (dict): Information of images captured by multiple cameras
+    images = dict()
+    for cam_type in camera_types:
+        images[cam_type] = get_empty_img_info()
+    single_image_sweep['images'] = images
+    return single_image_sweep
+
+
+def get_empty_standard_data_info(
+        camera_types=['CAM0', 'CAM1', 'CAM2', 'CAM3', 'CAM4']):
+
+    data_info = dict(
+        # (str): Sample id of the frame.
+        sample_idx=None,
+        # (str, optional): '000010'
+        token=None,
+        **get_single_image_sweep(camera_types),
+        # (dict, optional): dict containing information
+        # of the LiDAR point cloud frame.
+        lidar_points=get_empty_lidar_points(),
+        # (dict, optional): dict containing information
+        # of the radar point cloud frame.
+        radar_points=get_empty_radar_points(),
+        # (list[dict], optional): Image sweeps data.
+        image_sweeps=[],
+        lidar_sweeps=[],
+        instances=[],
+        # (list[dict], optional): Required by object
+        # detection, instances to be ignored during training.
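+        # (The ScanNet converters below leave this list empty, so it is
+        # dropped again by clear_data_info_unused_keys.)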
+        instances_ignore=[],
+        # (str, optional): Path of semantic labels for each point.
+        pts_semantic_mask_path=None,
+        # (str, optional): Path of instance labels for each point.
+        pts_instance_mask_path=None)
+    return data_info
+
+
+def clear_instance_unused_keys(instance):
+    keys = list(instance.keys())
+    for k in keys:
+        if instance[k] is None:
+            del instance[k]
+    return instance
+
+
+def clear_data_info_unused_keys(data_info):
+    keys = list(data_info.keys())
+    empty_flag = True
+    for key in keys:
+        # we allow no annotations in datainfo
+        if key in ['instances', 'cam_sync_instances', 'cam_instances']:
+            empty_flag = False
+            continue
+        if isinstance(data_info[key], list):
+            if len(data_info[key]) == 0:
+                del data_info[key]
+            else:
+                empty_flag = False
+        elif data_info[key] is None:
+            del data_info[key]
+        elif isinstance(data_info[key], dict):
+            _, sub_empty_flag = clear_data_info_unused_keys(data_info[key])
+            if sub_empty_flag is False:
+                empty_flag = False
+            else:
+                # sub field is empty
+                del data_info[key]
+        else:
+            empty_flag = False
+
+    return data_info, empty_flag
+
+
+def update_scannet_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print('Warning: you may be overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin')
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    # class names outside METAINFO['classes'], collected across all frames
+    ignore_class_name = set()
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['pts_path']).name
+        if 'pts_semantic_mask_path' in ori_info_dict:
+            temp_data_info['pts_semantic_mask_path'] = Path(
+                ori_info_dict['pts_semantic_mask_path']).name
+        if 'pts_instance_mask_path' in ori_info_dict:
+            temp_data_info['pts_instance_mask_path'] = Path(
+                ori_info_dict['pts_instance_mask_path']).name
+        if 'super_pts_path' in ori_info_dict:
+            temp_data_info['super_pts_path'] = Path(
+                ori_info_dict['super_pts_path']).name
+
+        # TODO support camera
+        # np.linalg.inv(info['axis_align_matrix'] @ extrinsic): depth2cam
+        anns = ori_info_dict.get('annos', None)
+        if anns is not None:
+            temp_data_info['axis_align_matrix'] = anns[
+                'axis_align_matrix'].tolist()
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['name'])
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+
+                    if anns['name'][instance_id] in METAINFO['classes']:
+                        empty_instance['bbox_label_3d'] = METAINFO[
+                            'classes'].index(anns['name'][instance_id])
+                    else:
+                        ignore_class_name.add(anns['name'][instance_id])
+                        empty_instance['bbox_label_3d'] = -1
+
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
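+    # Category names not listed in METAINFO['classes'] were mapped to the
+    # ignore label -1 above; report any that were encountered.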
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'scannet'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_scannet200_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print('Warning: you may be overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {
+        'classes':
+        ('chair', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk',
+         'office chair', 'bed', 'pillow', 'sink', 'picture', 'window',
+         'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair',
+         'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet',
+         'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool',
+         'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table',
+         'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', 'tv stand',
+         'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet',
+         'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle',
+         'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror',
+         'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop',
+         'shower', 'paper', 'person', 'paper towel dispenser', 'oven',
+         'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail',
+         'radiator', 'recycling bin', 'container', 'wardrobe',
+         'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light',
+         'laundry basket', 'pipe', 'clothes dryer', 'guitar',
+         'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle',
+         'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket',
+         'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll',
+         'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board',
+         'ironing board', 'fireplace', 'soap dish', 'kitchen counter',
+         'doorframe', 'toilet paper dispenser', 'mini fridge',
+         'fire extinguisher', 'ball', 'hat', 'shower curtain rod',
+         'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar',
+         'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser',
+         'furniture', 'cart', 'storage container', 'scale', 'tissue box',
+         'light switch', 'crate', 'power outlet', 'decoration', 'sign',
+         'projector', 'closet door', 'vacuum cleaner', 'candle', 'plunger',
+         'stuffed animal', 'headphones', 'dish rack', 'broom', 'guitar case',
+         'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar',
+         'purse', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl',
+         'paper bag', 'alarm clock', 'music stand', 'projector screen',
+         'divider', 'laundry detergent', 'bathroom counter', 'object',
+         'bathroom vanity', 'closet wall', 'laundry hamper',
+         'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell',
+         'stair rail', 'tube', 'bathroom cabinet', 'cd case', 'closet rod',
+         'coffee kettle', 'structure', 'shower head', 'keyboard piano',
+         'case of water bottles', 'coat rack', 'storage organizer',
+         'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster',
+         'potted plant', 'luggage', 'mattress')
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    # class names outside METAINFO['classes'], collected across all frames
+    ignore_class_name = set()
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
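+        # Start from an empty v2 template and copy the legacy fields onto
+        # it; unused (None/empty) keys are stripped again afterwards.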
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['pts_path']).name
+        if 'pts_semantic_mask_path' in ori_info_dict:
+            temp_data_info['pts_semantic_mask_path'] = Path(
+                ori_info_dict['pts_semantic_mask_path']).name
+        if 'pts_instance_mask_path' in ori_info_dict:
+            temp_data_info['pts_instance_mask_path'] = Path(
+                ori_info_dict['pts_instance_mask_path']).name
+        if 'super_pts_path' in ori_info_dict:
+            temp_data_info['super_pts_path'] = Path(
+                ori_info_dict['super_pts_path']).name
+
+        # TODO support camera
+        # np.linalg.inv(info['axis_align_matrix'] @ extrinsic): depth2cam
+        anns = ori_info_dict.get('annos', None)
+        if anns is not None:
+            temp_data_info['axis_align_matrix'] = anns[
+                'axis_align_matrix'].tolist()
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['name'])
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+
+                    if anns['name'][instance_id] in METAINFO['classes']:
+                        empty_instance['bbox_label_3d'] = METAINFO[
+                            'classes'].index(anns['name'][instance_id])
+                    else:
+                        ignore_class_name.add(anns['name'][instance_id])
+                        empty_instance['bbox_label_3d'] = -1
+
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    # Category names not listed in METAINFO['classes'] were mapped to the
+    # ignore label -1 above; report any that were encountered.
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'scannet200'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert legacy info pkl files to the v2 format.')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='scannet',
+        help='name of the dataset (scannet or scannet200)')
+    parser.add_argument(
+        '--pkl-path',
+        type=str,
+        default='./data/scannet/scannet_infos_train.pkl',
+        help='path of the info pkl to be converted')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='converted_annotations',
+        required=False,
+        help='output directory for the converted info pkl')
+    args = parser.parse_args()
+    return args
+
+
+def update_pkl_infos(dataset, out_dir, pkl_path):
+    if dataset.lower() == 'scannet':
+        update_scannet_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'scannet200':
+        update_scannet200_infos(pkl_path=pkl_path, out_dir=out_dir)
+    else:
+        raise NotImplementedError(f'Converting {dataset} to v2 is not '
+                                  'supported.')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    update_pkl_infos(
+        dataset=args.dataset, out_dir=args.out_dir, pkl_path=args.pkl_path)
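
A quick way to sanity-check the converted annotations (a sketch; the pkl
path below is a placeholder for whatever --out-dir produced):

    import mmengine

    info = mmengine.load('converted_annotations/scannet_infos_train.pkl')
    print(info['metainfo']['dataset'])    # e.g. 'scannet'
    print(len(info['data_list']))         # number of frames
    print(sorted(info['data_list'][0]))   # per-frame keys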