Spaces:

MohamedRashad
/

Game-Items-Generator

Runtime error

App Files Files Community

MohamedRashad committed on Dec 12, 2024

Commit

73c350d

1 Parent(s): 09758c4

Add initial module structure and base classes for samplers and representations

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
app.py +321 -0
assets/example_image/T.png +0 -0
assets/example_image/typical_building_building.png +0 -0
assets/example_image/typical_building_castle.png +0 -0
assets/example_image/typical_building_colorful_cottage.png +0 -0
assets/example_image/typical_building_maya_pyramid.png +0 -0
assets/example_image/typical_building_mushroom.png +0 -0
assets/example_image/typical_building_space_station.png +0 -0
assets/example_image/typical_creature_dragon.png +0 -0
assets/example_image/typical_creature_elephant.png +0 -0
assets/example_image/typical_creature_furry.png +0 -0
assets/example_image/typical_creature_quadruped.png +0 -0
assets/example_image/typical_creature_robot_crab.png +0 -0
assets/example_image/typical_creature_robot_dinosour.png +0 -0
assets/example_image/typical_creature_rock_monster.png +0 -0
assets/example_image/typical_humanoid_block_robot.png +0 -0
assets/example_image/typical_humanoid_dragonborn.png +0 -0
assets/example_image/typical_humanoid_dwarf.png +0 -0
assets/example_image/typical_humanoid_goblin.png +0 -0
assets/example_image/typical_humanoid_mech.png +0 -0
assets/example_image/typical_misc_crate.png +0 -0
assets/example_image/typical_misc_fireplace.png +0 -0
assets/example_image/typical_misc_gate.png +0 -0
assets/example_image/typical_misc_lantern.png +0 -0
assets/example_image/typical_misc_magicbook.png +0 -0
assets/example_image/typical_misc_mailbox.png +0 -0
assets/example_image/typical_misc_monster_chest.png +0 -0
assets/example_image/typical_misc_paper_machine.png +0 -0
assets/example_image/typical_misc_phonograph.png +0 -0
assets/example_image/typical_misc_portal2.png +0 -0
assets/example_image/typical_misc_storage_chest.png +0 -0
assets/example_image/typical_misc_telephone.png +0 -0
assets/example_image/typical_misc_television.png +0 -0
assets/example_image/typical_misc_workbench.png +0 -0
assets/example_image/typical_vehicle_biplane.png +0 -0
assets/example_image/typical_vehicle_bulldozer.png +0 -0
assets/example_image/typical_vehicle_cart.png +0 -0
assets/example_image/typical_vehicle_excavator.png +0 -0
assets/example_image/typical_vehicle_helicopter.png +0 -0
assets/example_image/typical_vehicle_locomotive.png +0 -0
assets/example_image/typical_vehicle_pirate_ship.png +0 -0
assets/example_image/weatherworn_misc_paper_machine3.png +0 -0
extensions/nvdiffrast/LICENSE.txt +97 -0
extensions/nvdiffrast/README.md +42 -0
extensions/nvdiffrast/nvdiffrast/__init__.py +9 -0
extensions/nvdiffrast/nvdiffrast/common/antialias.cu +558 -0
extensions/nvdiffrast/nvdiffrast/common/antialias.h +50 -0
extensions/nvdiffrast/nvdiffrast/common/common.cpp +60 -0
extensions/nvdiffrast/nvdiffrast/common/common.h +263 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+wheels/*.whl filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,321 @@

+import gradio as gr
+import spaces
+from gradio_litmodel3d import LitModel3D
+import os
+os.environ['SPCONV_ALGO'] = 'native'
+from typing import *
+import torch
+import numpy as np
+import imageio
+import uuid
+from easydict import EasyDict as edict
+from PIL import Image
+from trellis.pipelines import TrellisImageTo3DPipeline
+from trellis.representations import Gaussian, MeshExtractResult
+from trellis.utils import render_utils, postprocessing_utils
+from huggingface_hub import InferenceClient
+# Hugging Face Inference API client shared by the prompt-enhancement and text-to-image helpers.
+# The HF_API_KEY environment variable is required: a missing key raises KeyError at import time.
+client = InferenceClient(api_key=os.environ["HF_API_KEY"])
+def generate_t2i_prompt(item_name):
+    llm_prompt_template = """You are tasked with creating a concise yet highly detailed description of an item to be used for generating an image in a game development pipeline. The image should show the **entire item** with no parts cropped or hidden. The background should always be plain and monocolor, with no focus on it.
+### Guidelines:
+1. **Whole Item Focus**: The description should emphasize the full item, ensuring it is clearly depicted in the image.
+2. **Concise Details**: Use vivid but compact language to describe the item's shape, materials, textures, colors, and unique features. Avoid unnecessary elaboration or context.
+3. **No Background Details**: Specify that the background is plain and monocolor without describing it further.
+### Examples:
+Item: "Golden Pocket Watch"
+A vintage golden pocket watch with intricate floral engravings, polished metal, and Roman numerals on its clock face. Its chain is smooth and reflective, completing the elegant design.
+Item: "Crystal Vase"
+A tall crystal vase with a fluted top edge, clear polished surface, and delicate floral engravings. The crystal glimmers subtly, showing off its refined craftsmanship.
+Now generate a concise description for the item: "{item_name}"
+Focus on the item itself, ensuring it is fully described, and specify a plain, white background and the output is no longer than 77 tokens.
+"""
+    messages = [
+        {
+            "role": "user",
+            "content": llm_prompt_template.format(item_name=item_name)
+        }
+    ]
+    completion = client.chat.completions.create(
+        model="Qwen/Qwen2.5-72B-Instruct",
+        messages=messages,
+        max_tokens=500
+    )
+    object_t2i_prompt = completion.choices[0].message.content
+    print(object_t2i_prompt)
+    return object_t2i_prompt
+# generate_t2i_prompt("Golden Isalmic Mosque")
+# exit()
+def generate_item_image(object_t2i_prompt):
+    image = client.text_to_image(object_t2i_prompt, model="black-forest-labs/FLUX.1-dev", width=1024, height=1024, guidance_scale=3.5, num_inference_steps=28)
+    trial_id, processed_image = preprocess_image(image)
+    return trial_id, processed_image
+# Largest int32 value; upper bound of the seed slider and of randomly drawn seeds.
+MAX_SEED = np.iinfo(np.int32).max
+# Scratch directory holding preprocessed images, preview videos and exported GLB files.
+TMP_DIR = "/tmp/Trellis-demo"
+os.makedirs(TMP_DIR, exist_ok=True)
+def preprocess_image(image: Image.Image) -> Tuple[str, Image.Image]:
+    """
+    Preprocess the input image.
+    Args:
+        image (Image.Image): The input image.
+    Returns:
+        str: uuid of the trial.
+        Image.Image: The preprocessed image.
+    """
+    trial_id = str(uuid.uuid4())
+    processed_image = pipeline.preprocess_image(image)
+    processed_image.save(f"{TMP_DIR}/{trial_id}.png")
+    return trial_id, processed_image
+def pack_state(gs: Gaussian, mesh: MeshExtractResult, trial_id: str) -> dict:
+    return {
+        'gaussian': {
+            **gs.init_params,
+            '_xyz': gs._xyz.cpu().numpy(),
+            '_features_dc': gs._features_dc.cpu().numpy(),
+            '_scaling': gs._scaling.cpu().numpy(),
+            '_rotation': gs._rotation.cpu().numpy(),
+            '_opacity': gs._opacity.cpu().numpy(),
+        },
+        'mesh': {
+            'vertices': mesh.vertices.cpu().numpy(),
+            'faces': mesh.faces.cpu().numpy(),
+        },
+        'trial_id': trial_id,
+    }
+def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
+    gs = Gaussian(
+        aabb=state['gaussian']['aabb'],
+        sh_degree=state['gaussian']['sh_degree'],
+        mininum_kernel_size=state['gaussian']['mininum_kernel_size'],
+        scaling_bias=state['gaussian']['scaling_bias'],
+        opacity_bias=state['gaussian']['opacity_bias'],
+        scaling_activation=state['gaussian']['scaling_activation'],
+    )
+    gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda')
+    gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda')
+    gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda')
+    gs._rotation = torch.tensor(state['gaussian']['_rotation'], device='cuda')
+    gs._opacity = torch.tensor(state['gaussian']['_opacity'], device='cuda')
+    mesh = edict(
+        vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
+        faces=torch.tensor(state['mesh']['faces'], device='cuda'),
+    )
+    return gs, mesh, state['trial_id']
+@spaces.GPU
+def image_to_3d(trial_id: str, seed: int, randomize_seed: bool, ss_guidance_strength: float, ss_sampling_steps: int, slat_guidance_strength: float, slat_sampling_steps: int) -> Tuple[dict, str]:
+    """
+    Convert an image to a 3D model.
+    Args:
+        trial_id (str): The uuid of the trial.
+        seed (int): The random seed.
+        randomize_seed (bool): Whether to randomize the seed.
+        ss_guidance_strength (float): The guidance strength for sparse structure generation.
+        ss_sampling_steps (int): The number of sampling steps for sparse structure generation.
+        slat_guidance_strength (float): The guidance strength for structured latent generation.
+        slat_sampling_steps (int): The number of sampling steps for structured latent generation.
+    Returns:
+        dict: The information of the generated 3D model.
+        str: The path to the video of the 3D model.
+    """
+    if randomize_seed:
+        seed = np.random.randint(0, MAX_SEED)
+    outputs = pipeline.run(
+        Image.open(f"{TMP_DIR}/{trial_id}.png"),
+        seed=seed,
+        formats=["gaussian", "mesh"],
+        preprocess_image=False,
+        sparse_structure_sampler_params={
+            "steps": ss_sampling_steps,
+            "cfg_strength": ss_guidance_strength,
+        },
+        slat_sampler_params={
+            "steps": slat_sampling_steps,
+            "cfg_strength": slat_guidance_strength,
+        },
+    )
+    video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
+    video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
+    video = [np.concatenate([video[i], video_geo[i]], axis=1) for i in range(len(video))]
+    trial_id = uuid.uuid4()
+    video_path = f"{TMP_DIR}/{trial_id}.mp4"
+    os.makedirs(os.path.dirname(video_path), exist_ok=True)
+    imageio.mimsave(video_path, video, fps=15)
+    state = pack_state(outputs['gaussian'][0], outputs['mesh'][0], trial_id)
+    return state, video_path
+@spaces.GPU
+def extract_glb(state: dict, mesh_simplify: float, texture_size: int) -> Tuple[str, str]:
+    """
+    Extract a GLB file from the 3D model.
+    Args:
+        state (dict): The state of the generated 3D model.
+        mesh_simplify (float): The mesh simplification factor.
+        texture_size (int): The texture resolution.
+    Returns:
+        str: The path to the extracted GLB file.
+    """
+    gs, mesh, trial_id = unpack_state(state)
+    glb = postprocessing_utils.to_glb(gs, mesh, simplify=mesh_simplify, texture_size=texture_size, verbose=False)
+    glb_path = f"{TMP_DIR}/{trial_id}.glb"
+    glb.export(glb_path)
+    return glb_path, glb_path
+def activate_button() -> gr.Button:
+    """Return a button update that makes the target button clickable."""
+    return gr.Button(interactive=True)
+def deactivate_button() -> gr.Button:
+    """Return a button update that makes the target button non-clickable."""
+    return gr.Button(interactive=False)
+with gr.Blocks(title="Game Items Generator") as demo:
+    gr.HTML("<h1 style='text-align: center;'>Game Items Generator</h1>")
+    gr.Markdown("""
+    ## Text or Image to 3D Asset with [TRELLIS](https://trellis3d.github.io/)
+    - Write in a very simple words the item you want for your game and click "Enhance Prompt" to generate a text-to-image prompt.
+    - Click "Generate Image" to generate an image of the item or you can bypass all of the previous steps and uplod your own image.
+    - Click "Generate 3D video" to create a 3D asset. If the image has alpha channel, it be used as the mask. Otherwise, we use `rembg` to remove the background.
+    * If you find the generated 3D asset satisfactory, click "Extract GLB" to extract the GLB file and download it.
+    """)
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                item_text_field = gr.Textbox(label="Item Name", placeholder="Enter the name of the item", lines=2, scale=4)
+                enhance_prompt_btn = gr.Button("Enhance Prompt", variant="primary", scale=1)
+            generate_image_btn = gr.Button("Generate Image", variant="primary")
+            image_prompt = gr.Image(label="Image Prompt", image_mode="RGBA", type="pil", height=300)
+            with gr.Accordion(label="Generation Settings", open=False):
+                seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+                gr.Markdown("Stage 1: Sparse Structure Generation")
+                with gr.Row():
+                    ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
+                    ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
+                gr.Markdown("Stage 2: Structured Latent Generation")
+                with gr.Row():
+                    slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=3.0, step=0.1)
+                    slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
+            generate_btn = gr.Button("Generate 3D video")
+            with gr.Accordion(label="GLB Extraction Settings", open=False):
+                mesh_simplify = gr.Slider(0.9, 0.98, label="Simplify", value=0.95, step=0.01)
+                texture_size = gr.Slider(512, 2048, label="Texture Size", value=1024, step=512)
+            extract_glb_btn = gr.Button("Extract GLB", interactive=False)
+        with gr.Column():
+            video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
+            model_output = LitModel3D(label="Extracted GLB", exposure=20.0, height=300)
+            download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
+    trial_id = gr.Textbox(visible=False)
+    output_buf = gr.State()
+    # Example images at the bottom of the page
+    with gr.Row():
+        examples = gr.Examples(
+            examples=[
+                f'assets/example_image/{image}'
+                for image in os.listdir("assets/example_image")
+            ],
+            inputs=[image_prompt],
+            fn=preprocess_image,
+            outputs=[trial_id, image_prompt],
+            run_on_click=True,
+            examples_per_page=64,
+        )
+    # Handlers
+    enhance_prompt_btn.click(
+        generate_t2i_prompt,
+        inputs=[item_text_field],
+        outputs=[item_text_field],
+    )
+    generate_image_btn.click(
+        generate_item_image,
+        inputs=[item_text_field],
+        outputs=[trial_id, image_prompt],
+    )
+    image_prompt.upload(
+        preprocess_image,
+        inputs=[image_prompt],
+        outputs=[trial_id, image_prompt],
+    )
+    image_prompt.clear(
+        lambda: '',
+        outputs=[trial_id],
+    )
+    generate_btn.click(
+        image_to_3d,
+        inputs=[trial_id, seed, randomize_seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps],
+        outputs=[output_buf, video_output],
+    ).then(
+        activate_button,
+        outputs=[extract_glb_btn],
+    )
+    video_output.clear(
+        deactivate_button,
+        outputs=[extract_glb_btn],
+    )
+    extract_glb_btn.click(
+        extract_glb,
+        inputs=[output_buf, mesh_simplify, texture_size],
+        outputs=[model_output, download_glb],
+    ).then(
+        activate_button,
+        outputs=[download_glb],
+    )
+    model_output.clear(
+        deactivate_button,
+        outputs=[download_glb],
+    )
+# Launch the Gradio app
+if __name__ == "__main__":
+    pipeline = TrellisImageTo3DPipeline.from_pretrained("JeffreyXiang/TRELLIS-image-large")
+    pipeline.cuda()
+    try:
+        pipeline.preprocess_image(Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8)))    # Preload rembg
+    except:
+        pass
+    demo.launch()

assets/example_image/T.png ADDED Viewed

assets/example_image/typical_building_building.png ADDED Viewed

assets/example_image/typical_building_castle.png ADDED Viewed

assets/example_image/typical_building_colorful_cottage.png ADDED Viewed

assets/example_image/typical_building_maya_pyramid.png ADDED Viewed

assets/example_image/typical_building_mushroom.png ADDED Viewed

assets/example_image/typical_building_space_station.png ADDED Viewed

assets/example_image/typical_creature_dragon.png ADDED Viewed

assets/example_image/typical_creature_elephant.png ADDED Viewed

assets/example_image/typical_creature_furry.png ADDED Viewed

assets/example_image/typical_creature_quadruped.png ADDED Viewed

assets/example_image/typical_creature_robot_crab.png ADDED Viewed

assets/example_image/typical_creature_robot_dinosour.png ADDED Viewed

assets/example_image/typical_creature_rock_monster.png ADDED Viewed

assets/example_image/typical_humanoid_block_robot.png ADDED Viewed

assets/example_image/typical_humanoid_dragonborn.png ADDED Viewed

assets/example_image/typical_humanoid_dwarf.png ADDED Viewed

assets/example_image/typical_humanoid_goblin.png ADDED Viewed

assets/example_image/typical_humanoid_mech.png ADDED Viewed

assets/example_image/typical_misc_crate.png ADDED Viewed

assets/example_image/typical_misc_fireplace.png ADDED Viewed

assets/example_image/typical_misc_gate.png ADDED Viewed

assets/example_image/typical_misc_lantern.png ADDED Viewed

assets/example_image/typical_misc_magicbook.png ADDED Viewed

assets/example_image/typical_misc_mailbox.png ADDED Viewed

assets/example_image/typical_misc_monster_chest.png ADDED Viewed

assets/example_image/typical_misc_paper_machine.png ADDED Viewed

assets/example_image/typical_misc_phonograph.png ADDED Viewed

assets/example_image/typical_misc_portal2.png ADDED Viewed

assets/example_image/typical_misc_storage_chest.png ADDED Viewed

assets/example_image/typical_misc_telephone.png ADDED Viewed

assets/example_image/typical_misc_television.png ADDED Viewed

assets/example_image/typical_misc_workbench.png ADDED Viewed

assets/example_image/typical_vehicle_biplane.png ADDED Viewed

assets/example_image/typical_vehicle_bulldozer.png ADDED Viewed

assets/example_image/typical_vehicle_cart.png ADDED Viewed

assets/example_image/typical_vehicle_excavator.png ADDED Viewed

assets/example_image/typical_vehicle_helicopter.png ADDED Viewed

assets/example_image/typical_vehicle_locomotive.png ADDED Viewed

assets/example_image/typical_vehicle_pirate_ship.png ADDED Viewed

assets/example_image/weatherworn_misc_paper_machine3.png ADDED Viewed

extensions/nvdiffrast/LICENSE.txt ADDED Viewed

	@@ -0,0 +1,97 @@

+Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
+Nvidia Source Code License (1-Way Commercial)
+=======================================================================
+1. Definitions
+"Licensor" means any person or entity that distributes its Work.
+"Software" means the original work of authorship made available under
+this License.
+"Work" means the Software and any additions to or derivative works of
+the Software that are made available under this License.
+The terms "reproduce," "reproduction," "derivative works," and
+"distribution" have the meaning as provided under U.S. copyright law;
+provided, however, that for the purposes of this License, derivative
+works shall not include works that remain separable from, or merely
+link (or bind by name) to the interfaces of, the Work.
+Works, including the Software, are "made available" under this License
+by including in or with the Work either (a) a copyright notice
+referencing the applicability of this License to the Work, or (b) a
+copy of this License.
+2. License Grants
+    2.1 Copyright Grant. Subject to the terms and conditions of this
+    License, each Licensor grants to you a perpetual, worldwide,
+    non-exclusive, royalty-free, copyright license to reproduce,
+    prepare derivative works of, publicly display, publicly perform,
+    sublicense and distribute its Work and any resulting derivative
+    works in any form.
+3. Limitations
+    3.1 Redistribution. You may reproduce or distribute the Work only
+    if (a) you do so under this License, (b) you include a complete
+    copy of this License with your distribution, and (c) you retain
+    without modification any copyright, patent, trademark, or
+    attribution notices that are present in the Work.
+    3.2 Derivative Works. You may specify that additional or different
+    terms apply to the use, reproduction, and distribution of your
+    derivative works of the Work ("Your Terms") only if (a) Your Terms
+    provide that the use limitation in Section 3.3 applies to your
+    derivative works, and (b) you identify the specific derivative
+    works that are subject to Your Terms. Notwithstanding Your Terms,
+    this License (including the redistribution requirements in Section
+    3.1) will continue to apply to the Work itself.
+    3.3 Use Limitation. The Work and any derivative works thereof only
+    may be used or intended for use non-commercially. The Work or
+    derivative works thereof may be used or intended for use by Nvidia
+    or its affiliates commercially or non-commercially. As used herein,
+    "non-commercially" means for research or evaluation purposes only
+    and not for any direct or indirect monetary gain.
+    3.4 Patent Claims. If you bring or threaten to bring a patent claim
+    against any Licensor (including any claim, cross-claim or
+    counterclaim in a lawsuit) to enforce any patents that you allege
+    are infringed by any Work, then your rights under this License from
+    such Licensor (including the grant in Section 2.1) will terminate
+    immediately.
+    3.5 Trademarks. This License does not grant any rights to use any
+    Licensor's or its affiliates' names, logos, or trademarks, except
+    as necessary to reproduce the notices described in this License.
+    3.6 Termination. If you violate any term of this License, then your
+    rights under this License (including the grant in Section 2.1) will
+    terminate immediately.
+4. Disclaimer of Warranty.
+THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
+NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
+THIS LICENSE.
+5. Limitation of Liability.
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES.
+=======================================================================

extensions/nvdiffrast/README.md ADDED Viewed

	@@ -0,0 +1,42 @@

+## Nvdiffrast &ndash; Modular Primitives for High-Performance Differentiable Rendering
+![Teaser image](./docs/img/teaser.png)
+**Modular Primitives for High-Performance Differentiable Rendering**<br>
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
+Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
+Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
+## Licenses
+Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
+This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
+For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
+We do not currently accept outside code contributions in the form of pull requests.
+Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
+[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
+originally shared under
+[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
+Mesh and texture stored as part of `samples/data/earth.npz` are derived from
+[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
+model originally made available under
+[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
+## Citation
+```
+@article{Laine2020diffrast,
+  title   = {Modular Primitives for High-Performance Differentiable Rendering},
+  author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+  journal = {ACM Transactions on Graphics},
+  year    = {2020},
+  volume  = {39},
+  number  = {6}
+}
+```

extensions/nvdiffrast/nvdiffrast/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+__version__ = '0.3.3'

extensions/nvdiffrast/nvdiffrast/common/antialias.cu ADDED Viewed

	@@ -0,0 +1,558 @@

+// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+#include "antialias.h"
+//------------------------------------------------------------------------
+// Helpers.
+#define F32_MAX (3.402823466e+38f)
+static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } // True when the sign bits of a and b match: the XOR of the raw bit patterns is then non-negative as an int. +0 and -0 count as different signs.
+static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } // Division-free test for n0/d0 > n1/d1: cross-multiplies, and inverts the outcome when d0 and d1 differ in sign.
+// Returns the index (0, 1 or 2) of the largest of the rationals n0/d0, n1/d1, n2/d2, as judged by rational_gt().
+static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
+{
+    // Index 2 wins only if it beats both of the others.
+    if (rational_gt(n2, n0, d2, d0) && rational_gt(n2, n1, d2, d1))
+        return 2;
+    // Otherwise choose between 0 and 1; index 0 is the fallback.
+    return rational_gt(n1, n0, d1, d0) ? 1 : 0;
+}
+//------------------------------------------------------------------------
+// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
+struct AAWorkItem
+{
+    enum // EDGE_MASK is a bit mask; FLAG_DOWN_BIT is used as a shift count (1 << FLAG_DOWN_BIT) when work items are written.
+    {
+        EDGE_MASK       = 3,    // Edge index in lowest bits.
+        FLAG_DOWN_BIT   = 2,    // Down instead of right.
+        FLAG_TRI1_BIT   = 3,    // Edge is from other pixel's triangle.
+    };
+    int             px, py;         // Pixel x, y.
+    unsigned int    pz_flags;       // High 16 bits = pixel z, low 16 bits = edge index and flags.
+    float           alpha;          // Antialiasing alpha value. Zero if no AA.
+};
+//------------------------------------------------------------------------
+// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
+#define JENKINS_MAGIC (0x9e3779b9u)
+static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) // Mixes the three 32-bit words in place; all three arguments are modified through their references.
+{
+    a -= b; a -= c; a ^= (c>>13); // Three rounds of subtract/subtract/xor-shift over (a, b, c); the statement order is significant.
+    b -= c; b -= a; b ^= (a<<8);
+    c -= a; c -= b; c ^= (b>>13);
+    a -= b; a -= c; a ^= (c>>12);
+    b -= c; b -= a; b ^= (a<<16);
+    c -= a; c -= b; c ^= (b>>5);
+    a -= b; a -= c; a ^= (c>>3);
+    b -= c; b -= a; b ^= (a<<10);
+    c -= a; c -= b; c ^= (b>>15);
+}
+// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
+class HashIndex
+{
+public:
+    __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) // Derives the start slot and the probe stride from the low and high 32 bits of the key.
+    {
+        m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
+        m_idx  = (uint32_t)(key & 0xffffffffu);
+        m_skip = (uint32_t)(key >> 32);
+        uint32_t dummy = JENKINS_MAGIC;
+        jenkins_mix(m_idx, m_skip, dummy); // Scramble both words; dummy only seeds the mix and its result is discarded.
+        m_idx &= m_mask;
+        m_skip &= m_mask;
+        m_skip |= 1; // Force the stride to be odd. NOTE(review): presumably relies on the table size being a power of two so every slot is reachable - confirm.
+    }
+    __device__ __forceinline__ int get(void) const { return m_idx; } // Current slot index.
+    __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } // Step to the next probe slot, wrapping through the mask.
+private:
+    uint32_t m_idx, m_skip, m_mask; // Current slot, probe stride, and index mask.
+};
+static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) // Stores v under key. An entry is treated as four 32-bit words: the 64-bit key in the first two, up to two distinct values in words 2 and 3.
+{
+    HashIndex idx(p, key);
+    while(1) // Probe until a slot is claimed for this key or a slot already holding this key is found.
+    {
+        uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); // Writes key only if the key field is still 0 (empty).
+        if (prev == 0 || prev == key)
+            break;
+        idx.next();
+    }
+    int* q = (int*)&p.evHash[idx.get()];
+    int a = atomicCAS(q+2, 0, v); // Try to place v in the first value word.
+    if (a != 0 && a != v)
+        atomicCAS(q+3, 0, v); // First word already holds a different value: use the second word if it is still empty.
+}
+static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) // Returns the two value words (z, w) of the entry found for key.
+{
+    HashIndex idx(p, key);
+    while(1)
+    {
+        uint4 entry = p.evHash[idx.get()];
+        uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); // Reassemble the 64-bit key from the first two words.
+        if (k == key || k == 0) // Stop on a match or on an empty slot; a missing key therefore yields the empty slot's value words.
+            return make_int2((int)entry.z, (int)entry.w);
+        idx.next();
+    }
+}
+// Records vertex vn in the hash entry of the undirected edge (va, vb). Degenerate edges (va == vb) are ignored.
+static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
+{
+    if (va != vb)
+    {
+        // Order-independent key: smaller index in the low word, larger in the high word.
+        // Indices and the stored vertex are offset by one because hash_insert() treats 0 as "empty".
+        uint64_t lo = (uint32_t)min(va, vb) + 1;
+        uint64_t hi = (uint32_t)max(va, vb) + 1;
+        hash_insert(p, lo | (hi << 32), vn + 1);
+    }
+}
+// Looks up the edge (va, vb) and returns the stored vertex that is not vr.
+// Returns -1 when the edge is degenerate or vr is not one of the stored vertices.
+static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
+{
+    if (va == vb)
+        return -1;
+    // Same order-independent, offset-by-one key layout as used on insertion.
+    uint64_t lo = (uint32_t)min(va, vb) + 1;
+    uint64_t hi = (uint32_t)max(va, vb) + 1;
+    int2 stored = hash_find(p, lo | (hi << 32)) - 1; // Undo the +1 offset of the stored values.
+    if (stored.x == vr)
+        return stored.y;
+    return (stored.y == vr) ? stored.x : -1;
+}
+//------------------------------------------------------------------------
+// Mesh analysis kernel.
+// One thread per triangle: registers, for each of the triangle's three edges, the vertex not on that edge.
+__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
+{
+    const int triIdx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (triIdx >= p.numTriangles)
+        return;
+    const int a = p.tri[triIdx * 3 + 0];
+    const int b = p.tri[triIdx * 3 + 1];
+    const int c = p.tri[triIdx * 3 + 2];
+    // Skip triangles that reference out-of-range vertices or repeat a vertex.
+    const bool inRange = (a >= 0 && a < p.numVertices) &&
+                         (b >= 0 && b < p.numVertices) &&
+                         (c >= 0 && c < p.numVertices);
+    const bool degenerate = (a == b) || (b == c) || (c == a);
+    if (!inRange || degenerate)
+        return;
+    evhash_insert_vertex(p, b, c, a);
+    evhash_insert_vertex(p, c, a, b);
+    evhash_insert_vertex(p, a, b, c);
+}
+//------------------------------------------------------------------------
+// Discontinuity finder kernel.
+__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) // One thread per pixel: queues a work item for each right/down neighbor whose triangle ID differs from this pixel's.
+{
+    // Calculate pixel position; pz (from blockIdx.z) is the minibatch index.
+    int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
+    int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.width || py >= p.height || pz >= p.n)
+        return;
+    // Index of our triangle ID and fetch. rasterOut is addressed as 4 floats per pixel with the ID in component 3.
+    int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
+    float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
+    // Look right, clamp at edge (at the last column the pixel is compared against itself).
+    int pidx1 = pidx0;
+    if (px < p.width - 1)
+        pidx1 += 4;
+    float tri1 = p.rasterOut[pidx1];
+    // Look down, clamp at edge (at the last row the pixel is compared against itself).
+    int pidx2 = pidx0;
+    if (py < p.height - 1)
+        pidx2 += p.width << 2;
+    float tri2 = p.rasterOut[pidx2];
+    // Determine amount of work: one item per differing neighbor, so count is 0, 1 or 2.
+    int count = 0;
+    if (tri1 != tri0) count  = 1;
+    if (tri2 != tri0) count += 1;
+    if (!count)
+        return; // Nothing to emit. NOTE(review): exiting threads skip the __syncthreads() calls below - confirm this is intended.
+    // Coalesce work counter update to once per CTA.
+    __shared__ int s_temp;
+    s_temp = 0; // Stored by every thread that reaches this point; all store the same value.
+    __syncthreads();
+    int idx = atomicAdd(&s_temp, count); // idx = this thread's offset within the block's items; s_temp accumulates the block total.
+    __syncthreads();
+    if (idx == 0) // Only one thread can receive offset 0 (every count here is >= 1); it reserves the block's range in the global counter.
+    {
+        int base = atomicAdd(&p.workBuffer[0].x, s_temp);
+        s_temp = base + 1; // don't clobber the counters in first slot.
+    }
+    __syncthreads();
+    idx += s_temp; // Convert the block-local offset into an absolute workBuffer index.
+    // Write to memory. Item layout is (px, py, (pz << 16) + flags, 0); the right-neighbor item carries no flag bits.
+    if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
+    if (tri2 != tri0) p.workBuffer[idx]   = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
+}
+//------------------------------------------------------------------------
+// Forward analysis kernel.
+__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) // Consumes the queued work items: for each pixel pair, tests the selected triangle's edges and, if a qualifying edge crosses between the pixels, blends their colors into p.output and annotates the work item.
+{
+    __shared__ int s_base; // Start index of the batch of work items currently claimed by this block.
+    int workCount = p.workBuffer[0].x; // Total number of queued work items.
+    for(;;)
+    {
+        // Persistent threads work fetcher: thread 0 claims the next batch of items for the whole block.
+        __syncthreads();
+        if (threadIdx.x == 0)
+            s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
+        __syncthreads();
+        int thread_idx = s_base + threadIdx.x;
+        if (thread_idx >= workCount)
+            return;
+        int4* pItem = p.workBuffer + thread_idx + 1; // +1 skips the counter slot at index 0.
+        int4 item = *pItem;
+        int px = item.x;
+        int py = item.y;
+        int pz = (int)(((unsigned int)item.z) >> 16); // Minibatch index lives in the upper 16 bits of item.z.
+        int d  = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; // 1 if the neighbor is the pixel below, 0 if it is the pixel to the right.
+        int pixel0 = px + p.width * (py + p.height * pz);
+        int pixel1 = pixel0 + (d ? p.width : 1);
+        float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; // Components 2 and 3 of the pixel's four rasterOut floats.
+        float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
+        int tri0 = float_to_triidx(zt0.y) - 1; // A negative result is treated as "no triangle" below.
+        int tri1 = float_to_triidx(zt1.y) - 1;
+        // Select triangle based on background / depth.
+        int tri = (tri0 >= 0) ? tri0 : tri1;
+        if (tri0 >= 0 && tri1 >= 0)
+            tri = (zt0.x < zt1.x) ? tri0 : tri1; // Both pixels covered: take the one with the smaller zt.x.
+        if (tri == tri1)
+        {
+            // Calculate with respect to neighbor pixel if chose that triangle.
+            px += 1 - d;
+            py += d;
+        }
+        // Bail out if triangle index is corrupt.
+        if (tri < 0 || tri >= p.numTriangles)
+            continue;
+        // Fetch vertex indices.
+        int vi0 = p.tri[tri * 3 + 0];
+        int vi1 = p.tri[tri * 3 + 1];
+        int vi2 = p.tri[tri * 3 + 2];
+        // Bail out if vertex indices are corrupt.
+        if (vi0 < 0 || vi0 >= p.numVertices ||
+            vi1 < 0 || vi1 >= p.numVertices ||
+            vi2 < 0 || vi2 >= p.numVertices)
+            continue;
+        // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
+        int op0 = evhash_find_vertex(p, vi2, vi1, vi0); // -1 when no opposite vertex is found.
+        int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
+        int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
+        // Instance mode: Adjust vertex indices based on minibatch index.
+        if (p.instance_mode)
+        {
+            int vbase = pz * p.numVertices;
+            vi0 += vbase;
+            vi1 += vbase;
+            vi2 += vbase;
+            if (op0 >= 0) op0 += vbase;
+            if (op1 >= 0) op1 += vbase;
+            if (op2 >= 0) op2 += vbase;
+        }
+        // Fetch vertex positions (one float4 per vertex).
+        float4 p0 = ((float4*)p.pos)[vi0];
+        float4 p1 = ((float4*)p.pos)[vi1];
+        float4 p2 = ((float4*)p.pos)[vi2];
+        float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
+        float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
+        float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
+        // Project vertices to pixel space, expressed relative to the (possibly shifted) pixel px, py.
+        float w0  = 1.f / p0.w;
+        float w1  = 1.f / p1.w;
+        float w2  = 1.f / p2.w;
+        float ow0 = 1.f / o0.w;
+        float ow1 = 1.f / o1.w;
+        float ow2 = 1.f / o2.w;
+        float fx  = (float)px + .5f - p.xh;
+        float fy  = (float)py + .5f - p.yh;
+        float x0  = p0.x * w0 * p.xh - fx;
+        float y0  = p0.y * w0 * p.yh - fy;
+        float x1  = p1.x * w1 * p.xh - fx;
+        float y1  = p1.y * w1 * p.yh - fy;
+        float x2  = p2.x * w2 * p.xh - fx;
+        float y2  = p2.y * w2 * p.yh - fy;
+        float ox0 = o0.x * ow0 * p.xh - fx;
+        float oy0 = o0.y * ow0 * p.yh - fy;
+        float ox1 = o1.x * ow1 * p.xh - fx;
+        float oy1 = o1.y * ow1 * p.yh - fy;
+        float ox2 = o2.x * ow2 * p.xh - fx;
+        float oy2 = o2.y * ow2 * p.yh - fy;
+        // Signs to kill non-silhouette edges.
+        float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
+        float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
+        float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
+        float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
+        // If no matching signs anywhere, skip the rest.
+        if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
+        {
+            // XY flip for horizontal edges.
+            if (d)
+            {
+                swap(x0, y0);
+                swap(x1, y1);
+                swap(x2, y2);
+            }
+            float dx0 = x2 - x1;
+            float dx1 = x0 - x2;
+            float dx2 = x1 - x0;
+            float dy0 = y2 - y1;
+            float dy1 = y0 - y2;
+            float dy2 = y1 - y0;
+            // Check if an edge crosses between us and the neighbor pixel.
+            float dc = -F32_MAX; // Stays at -F32_MAX (fails the range test below) unless a suitable edge is found.
+            float ds = (tri == tri0) ? 1.f : -1.f;
+            float d0 = ds * (x1*dy0 - y1*dx0);
+            float d1 = ds * (x2*dy1 - y2*dx1);
+            float d2 = ds * (x0*dy2 - y0*dx2);
+            if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; // Mark edge 0 as a non-candidate.
+            if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f; // Mark edge 1 as a non-candidate.
+            if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f; // Mark edge 2 as a non-candidate.
+            int di = max_idx3(d0, d1, d2, dy0, dy1, dy2); // Index (0..2) of the edge selected for the crossing test.
+            if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
+            if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
+            if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
+            float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
+            // Adjust output image if a suitable edge was found.
+            if (dc > -eps && dc < 1.f + eps)
+            {
+                dc = fminf(fmaxf(dc, 0.f), 1.f);
+                float alpha = ds * (.5f - dc); // Signed blend weight; |alpha| <= 0.5 because dc was clamped to [0, 1].
+                const float* pColor0 = p.color + pixel0 * p.channels;
+                const float* pColor1 = p.color + pixel1 * p.channels;
+                float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; // alpha > 0 adjusts pixel0, otherwise pixel1.
+                for (int i=0; i < p.channels; i++)
+                    atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
+                // Rewrite the work item's flags and alpha. Keep original px, py.
+                unsigned int flags = pz << 16;
+                flags |= di;
+                flags |= d << AAWorkItem::FLAG_DOWN_BIT;
+                flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT; // Sign bit of ds: set when tri != tri0.
+                ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); // Overwrites item.z (flags) and item.w (alpha bits), which the gradient kernel reads back.
+            }
+        }
+    }
+}
+//------------------------------------------------------------------------
+// Gradient kernel.
+__global__ void AntialiasGradKernel(const AntialiasKernelParams p) // Backward pass: revisits the work items annotated by the forward analysis kernel and accumulates color gradients (p.gradColor) and position gradients (p.gradPos).
+{
+    // Temporary space for coalesced atomics.
+    CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
+    __shared__ int s_base; // Work counter communication across entire CTA.
+    int workCount = p.workBuffer[0].x; // Total number of queued work items.
+    for(;;)
+    {
+        // Persistent threads work fetcher: thread 0 claims the next batch of items for the whole block.
+        __syncthreads();
+        if (threadIdx.x == 0)
+            s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
+        __syncthreads();
+        int thread_idx = s_base + threadIdx.x;
+        if (thread_idx >= workCount)
+            return;
+        // Read work item filled out by forward kernel (+1 skips the counter slot).
+        int4 item = p.workBuffer[thread_idx + 1];
+        unsigned int amask = __ballot_sync(0xffffffffu, item.w); // Bit set for each lane whose item has nonzero alpha bits.
+        if (item.w == 0)
+            continue; // No effect.
+        // Unpack work item and replicate setup from forward analysis kernel.
+        int px = item.x;
+        int py = item.y;
+        int pz = (int)(((unsigned int)item.z) >> 16);
+        int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
+        float alpha = __int_as_float(item.w);
+        int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1; // 1 if the forward kernel selected the neighbor pixel's triangle.
+        int di = item.z & AAWorkItem::EDGE_MASK; // Edge index stored by the forward kernel.
+        float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); // +1.f, or -1.f when the TRI1 flag is set. NOTE(review): appears unused below.
+        int pixel0 = px + p.width * (py + p.height * pz);
+        int pixel1 = pixel0 + (d ? p.width : 1);
+        int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
+        if (tri1)
+        {
+            px += 1 - d;
+            py += d;
+        }
+        // Bail out if triangle index is corrupt.
+        bool triFail = (tri < 0 || tri >= p.numTriangles);
+        amask = __ballot_sync(amask, !triFail); // Narrow amask to the lanes that continue past this check.
+        if (triFail)
+            continue;
+        // Outgoing color gradients.
+        float* pGrad0 = p.gradColor + pixel0 * p.channels;
+        float* pGrad1 = p.gradColor + pixel1 * p.channels;
+        // Incoming color gradients, read at the pixel the forward kernel wrote to.
+        const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
+        // Position gradient weight based on colors and incoming gradients.
+        float dd = 0.f;
+        const float* pColor0 = p.color + pixel0 * p.channels;
+        const float* pColor1 = p.color + pixel1 * p.channels;
+        // Loop over channels and accumulate.
+        for (int i=0; i < p.channels; i++)
+        {
+            float dy = pDy[i];
+            if (dy != 0.f)
+            {
+                // Update position gradient weight.
+                dd += dy * (pColor1[i] - pColor0[i]);
+                // Update color gradients. No coalescing because all have different targets.
+                float v = alpha * dy;
+                atomicAdd(&pGrad0[i], -v);
+                atomicAdd(&pGrad1[i], v);
+            }
+        }
+        // If position weight is zero, skip the rest.
+        bool noGrad = (dd == 0.f);
+        amask = __ballot_sync(amask, !noGrad);
+        if (noGrad)
+            continue;
+        // Fetch vertex indices of the active edge and their positions (i1, i2 are the two corners following di cyclically).
+        int i1 = (di < 2) ? (di + 1) : 0;
+        int i2 = (i1 < 2) ? (i1 + 1) : 0;
+        int vi1 = p.tri[3 * tri + i1];
+        int vi2 = p.tri[3 * tri + i2];
+        // Bail out if vertex indices are corrupt.
+        bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
+        amask = __ballot_sync(amask, !vtxFail);
+        if (vtxFail)
+            continue;
+        // Instance mode: Adjust vertex indices based on minibatch index.
+        if (p.instance_mode)
+        {
+            vi1 += pz * p.numVertices;
+            vi2 += pz * p.numVertices;
+        }
+        // Fetch vertex positions.
+        float4 p1 = ((float4*)p.pos)[vi1];
+        float4 p2 = ((float4*)p.pos)[vi2];
+        // Project vertices to pixel space.
+        float pxh = p.xh;
+        float pyh = p.yh;
+        float fx = (float)px + .5f - pxh;
+        float fy = (float)py + .5f - pyh;
+        // XY flip for horizontal edges.
+        if (d)
+        {
+            swap(p1.x, p1.y);
+            swap(p2.x, p2.y);
+            swap(pxh, pyh);
+            swap(fx, fy);
+        }
+        // Gradient calculation setup.
+        float w1 = 1.f / p1.w;
+        float w2 = 1.f / p2.w;
+        float x1 = p1.x * w1 * pxh - fx;
+        float y1 = p1.y * w1 * pyh - fy;
+        float x2 = p2.x * w2 * pxh - fx;
+        float y2 = p2.y * w2 * pyh - fy;
+        float dx = x2 - x1;
+        float dy = y2 - y1;
+        float db = x1*dy - y1*dx;
+        // Compute inverse delta-y with epsilon (epsilon takes the sign of dy, so the denominator moves away from zero).
+        float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
+        float iy = 1.f / (dy + ep);
+        // Compute position gradients.
+        float dby = db * iy;
+        float iw1 = -w1 * iy * dd;
+        float iw2 =  w2 * iy * dd;
+        float gp1x = iw1 * pxh * y2;
+        float gp2x = iw2 * pxh * y1;
+        float gp1y = iw1 * pyh * (dby - x2);
+        float gp2y = iw2 * pyh * (dby - x1);
+        float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
+        float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
+        // XY flip the gradients back to undo the flip applied to the inputs above.
+        if (d)
+        {
+            swap(gp1x, gp1y);
+            swap(gp2x, gp2y);
+        }
+        // Kill position gradients if alpha was saturated.
+        if (fabsf(alpha) >= 0.5f)
+        {
+            gp1x = gp1y = gp1w = 0.f;
+            gp2x = gp2y = gp2w = 0.f;
+        }
+        // Initialize coalesced atomics. Match both triangle ID and edge index.
+        // Also note that some threads may be inactive.
+        CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
+        // Accumulate gradients. gradPos is addressed as 4 floats per vertex.
+        caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
+        caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
+    }
+}
+//------------------------------------------------------------------------

extensions/nvdiffrast/nvdiffrast/common/antialias.h ADDED Viewed

	@@ -0,0 +1,50 @@

+// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+#pragma once
+#include "common.h"
+//------------------------------------------------------------------------
+// Constants and helpers.
+#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH         32 // Pixels covered per block in x by the discontinuity finder kernel.
+#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT        8 // Pixels covered per block in y by the discontinuity finder kernel.
+#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK        256 // Work items claimed per fetch by each block of the forward analysis kernel.
+#define AA_MESH_KERNEL_THREADS_PER_BLOCK            256 // NOTE(review): presumably the launch block size of the mesh analysis kernel - confirm against the host code.
+#define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc)        ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions. NOTE(review): (2 << 25) evaluates to 67108864, not 33554432 - confirm which threshold is intended.
+#define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc)    ((alloc) >= (2 << 25) ? 2 : 3) // log2 of AA_HASH_ELEMENTS_PER_TRIANGLE(alloc); the two macros must use the same threshold.
+#define AA_GRAD_KERNEL_THREADS_PER_BLOCK            256 // Work items claimed per fetch by each block of the gradient kernel.
+//------------------------------------------------------------------------
+// CUDA kernel params.
+struct AntialiasKernelParams
+{
+    const float*    color;          // Incoming color buffer; indexed as pixel * channels + channel.
+    const float*    rasterOut;      // Incoming rasterizer output buffer; read as 4 floats per pixel with the triangle ID in the last one.
+    const int*      tri;            // Incoming triangle buffer; 3 vertex indices per triangle.
+    const float*    pos;            // Incoming position buffer; read as one float4 (x, y, z, w) per vertex.
+    float*          output;         // Output buffer of forward kernel; indexed like color.
+    const float*    dy;             // Incoming gradients; indexed like output.
+    float*          gradColor;      // Output buffer, color gradient; indexed like color.
+    float*          gradPos;        // Output buffer, position gradient; 4 floats per vertex.
+    int4*           workBuffer;     // Buffer for storing intermediate work items. First item reserved for counters (.x = number of items, .y = fetch cursor of the persistent-thread kernels).
+    uint4*          evHash;         // Edge-vertex hash; each entry holds a 64-bit edge key in (x, y) and up to two vertex indices (+1) in (z, w).
+    int             allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
+    int             numTriangles;   // Number of triangles.
+    int             numVertices;    // Number of vertices.
+    int             width;          // Input width.
+    int             height;         // Input height.
+    int             n;              // Minibatch size.
+    int             channels;       // Channel count in color input.
+    float           xh, yh;         // Transfer to pixel space. NOTE(review): presumably half the width and half the height - confirm against the host code.
+    int             instance_mode;  // 0=normal, 1=instance mode (vertex indices are offset by minibatch index * numVertices).
+    int             tri_const;      // 1 if triangle array is known to be constant.
+};
+//------------------------------------------------------------------------

extensions/nvdiffrast/nvdiffrast/common/common.cpp ADDED Viewed

	@@ -0,0 +1,60 @@

+// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+#include <cuda_runtime.h>
+//------------------------------------------------------------------------
+// Block and grid size calculators for kernel launches.
+// Chooses a 2D thread block shape for a width x height buffer.
+// Starts from maxWidth x maxHeight and reshapes the block when the buffer is narrower or
+// shorter than that, never exceeding maxWidth * maxHeight threads in total.
+// Returns (1, 1, 1) when either the thread budget or the buffer area is at most one.
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height)
+{
+    const int threadBudget = maxWidth * maxHeight;
+    const bool degenerate = (threadBudget <= 1) || ((width * height) <= 1);
+    if (degenerate)
+        return dim3(1, 1, 1);
+
+    int blockW = maxWidth;
+    int blockH = maxHeight;
+    if (width < blockW)
+    {
+        // Narrow buffer: halve the block width for as long as the halved width still covers the buffer.
+        for (; (blockW >> 1) >= width; blockW >>= 1)
+        {
+        }
+        // Spend the freed threads on height, but no taller than the buffer.
+        const int tallest = threadBudget / blockW;
+        blockH = (tallest > height) ? height : tallest;
+    }
+    else if (height < blockH)
+    {
+        // Short buffer: trade height for width until the block fits vertically.
+        // The branch condition guarantees at least one iteration.
+        do
+        {
+            blockH >>= 1;
+            if (blockW < width)
+                blockW <<= 1;
+        } while (blockH > height);
+    }
+    return dim3(blockW, blockH, 1);
+}
+// Returns the number of blocks per axis needed to cover a width x height x depth domain
+// with blocks of the given size: each extent divided by the block extent, rounded up
+// (for positive extents).
+dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth)
+{
+    return dim3((width  - 1) / blockSize.x + 1,
+                (height - 1) / blockSize.y + 1,
+                (depth  - 1) / blockSize.z + 1);
+}
+//------------------------------------------------------------------------

extensions/nvdiffrast/nvdiffrast/common/common.h ADDED Viewed

	@@ -0,0 +1,263 @@

+// Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+#pragma once
+#include <cuda.h>
+#include <stdint.h>
+//------------------------------------------------------------------------
+// C++ helper function prototypes.
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height); // Picks a block shape of at most maxWidth * maxHeight threads in total, adapted to a width x height buffer.
+dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth); // Number of blocks per axis needed to cover width x height x depth, rounding up.
+//------------------------------------------------------------------------
+// The rest is CUDA device code specific stuff.
+#ifdef __CUDACC__
+//------------------------------------------------------------------------
+// Helpers for CUDA vector types.
+static __device__ __forceinline__ float2&   operator*=  (float2& a, const float2& b)       { a.x *= b.x; a.y *= b.y; return a; } // float2 helpers: every operation below is component-wise; a scalar operand is applied to each component.
+static __device__ __forceinline__ float2&   operator+=  (float2& a, const float2& b)       { a.x += b.x; a.y += b.y; return a; }
+static __device__ __forceinline__ float2&   operator-=  (float2& a, const float2& b)       { a.x -= b.x; a.y -= b.y; return a; }
+static __device__ __forceinline__ float2&   operator*=  (float2& a, float b)               { a.x *= b; a.y *= b; return a; }
+static __device__ __forceinline__ float2&   operator+=  (float2& a, float b)               { a.x += b; a.y += b; return a; }
+static __device__ __forceinline__ float2&   operator-=  (float2& a, float b)               { a.x -= b; a.y -= b; return a; }
+static __device__ __forceinline__ float2    operator*   (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
+static __device__ __forceinline__ float2    operator+   (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
+static __device__ __forceinline__ float2    operator-   (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
+static __device__ __forceinline__ float2    operator*   (const float2& a, float b)         { return make_float2(a.x * b, a.y * b); }
+static __device__ __forceinline__ float2    operator+   (const float2& a, float b)         { return make_float2(a.x + b, a.y + b); }
+static __device__ __forceinline__ float2    operator-   (const float2& a, float b)         { return make_float2(a.x - b, a.y - b); }
+static __device__ __forceinline__ float2    operator*   (float a, const float2& b)         { return make_float2(a * b.x, a * b.y); }
+static __device__ __forceinline__ float2    operator+   (float a, const float2& b)         { return make_float2(a + b.x, a + b.y); }
+static __device__ __forceinline__ float2    operator-   (float a, const float2& b)         { return make_float2(a - b.x, a - b.y); }
+static __device__ __forceinline__ float2    operator-   (const float2& a)                  { return make_float2(-a.x, -a.y); } // Unary negation.
+static __device__ __forceinline__ float3&   operator*=  (float3& a, const float3& b)       { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } // float3 helpers: every operation below is component-wise; a scalar operand is applied to each component.
+static __device__ __forceinline__ float3&   operator+=  (float3& a, const float3& b)       { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+static __device__ __forceinline__ float3&   operator-=  (float3& a, const float3& b)       { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+static __device__ __forceinline__ float3&   operator*=  (float3& a, float b)               { a.x *= b; a.y *= b; a.z *= b; return a; }
+static __device__ __forceinline__ float3&   operator+=  (float3& a, float b)               { a.x += b; a.y += b; a.z += b; return a; }
+static __device__ __forceinline__ float3&   operator-=  (float3& a, float b)               { a.x -= b; a.y -= b; a.z -= b; return a; }
+static __device__ __forceinline__ float3    operator*   (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static __device__ __forceinline__ float3    operator+   (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static __device__ __forceinline__ float3    operator-   (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static __device__ __forceinline__ float3    operator*   (const float3& a, float b)         { return make_float3(a.x * b, a.y * b, a.z * b); }
+static __device__ __forceinline__ float3    operator+   (const float3& a, float b)         { return make_float3(a.x + b, a.y + b, a.z + b); }
+static __device__ __forceinline__ float3    operator-   (const float3& a, float b)         { return make_float3(a.x - b, a.y - b, a.z - b); }
+static __device__ __forceinline__ float3    operator*   (float a, const float3& b)         { return make_float3(a * b.x, a * b.y, a * b.z); }
+static __device__ __forceinline__ float3    operator+   (float a, const float3& b)         { return make_float3(a + b.x, a + b.y, a + b.z); }
+static __device__ __forceinline__ float3    operator-   (float a, const float3& b)         { return make_float3(a - b.x, a - b.y, a - b.z); }
+static __device__ __forceinline__ float3    operator-   (const float3& a)                  { return make_float3(-a.x, -a.y, -a.z); } // Unary negation.
+static __device__ __forceinline__ float4&   operator*=  (float4& a, const float4& b)       { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } // float4 helpers: every operation below is component-wise; a scalar operand is applied to each component.
+static __device__ __forceinline__ float4&   operator+=  (float4& a, const float4& b)       { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+static __device__ __forceinline__ float4&   operator-=  (float4& a, const float4& b)       { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+static __device__ __forceinline__ float4&   operator*=  (float4& a, float b)               { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+static __device__ __forceinline__ float4&   operator+=  (float4& a, float b)               { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+static __device__ __forceinline__ float4&   operator-=  (float4& a, float b)               { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+static __device__ __forceinline__ float4    operator*   (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+static __device__ __forceinline__ float4    operator+   (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __device__ __forceinline__ float4    operator-   (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+static __device__ __forceinline__ float4    operator*   (const float4& a, float b)         { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
+static __device__ __forceinline__ float4    operator+   (const float4& a, float b)         { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
+static __device__ __forceinline__ float4    operator-   (const float4& a, float b)         { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
+static __device__ __forceinline__ float4    operator*   (float a, const float4& b)         { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
+static __device__ __forceinline__ float4    operator+   (float a, const float4& b)         { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
+static __device__ __forceinline__ float4    operator-   (float a, const float4& b)         { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
+static __device__ __forceinline__ float4    operator-   (const float4& a)                  { return make_float4(-a.x, -a.y, -a.z, -a.w); } // Unary negation.
+static __device__ __forceinline__ int2&     operator*=  (int2& a, const int2& b)           { a.x *= b.x; a.y *= b.y; return a; } // int2 helpers: every operation below is component-wise; a scalar operand is applied to each component.
+static __device__ __forceinline__ int2&     operator+=  (int2& a, const int2& b)           { a.x += b.x; a.y += b.y; return a; }
+static __device__ __forceinline__ int2&     operator-=  (int2& a, const int2& b)           { a.x -= b.x; a.y -= b.y; return a; }
+static __device__ __forceinline__ int2&     operator*=  (int2& a, int b)                   { a.x *= b; a.y *= b; return a; }
+static __device__ __forceinline__ int2&     operator+=  (int2& a, int b)                   { a.x += b; a.y += b; return a; }
+static __device__ __forceinline__ int2&     operator-=  (int2& a, int b)                   { a.x -= b; a.y -= b; return a; }
+static __device__ __forceinline__ int2      operator*   (const int2& a, const int2& b)     { return make_int2(a.x * b.x, a.y * b.y); }
+static __device__ __forceinline__ int2      operator+   (const int2& a, const int2& b)     { return make_int2(a.x + b.x, a.y + b.y); }
+static __device__ __forceinline__ int2      operator-   (const int2& a, const int2& b)     { return make_int2(a.x - b.x, a.y - b.y); }
+static __device__ __forceinline__ int2      operator*   (const int2& a, int b)             { return make_int2(a.x * b, a.y * b); }
+static __device__ __forceinline__ int2      operator+   (const int2& a, int b)             { return make_int2(a.x + b, a.y + b); }
+static __device__ __forceinline__ int2      operator-   (const int2& a, int b)             { return make_int2(a.x - b, a.y - b); }
+static __device__ __forceinline__ int2      operator*   (int a, const int2& b)             { return make_int2(a * b.x, a * b.y); }
+static __device__ __forceinline__ int2      operator+   (int a, const int2& b)             { return make_int2(a + b.x, a + b.y); }
+static __device__ __forceinline__ int2      operator-   (int a, const int2& b)             { return make_int2(a - b.x, a - b.y); }
+static __device__ __forceinline__ int2      operator-   (const int2& a)                    { return make_int2(-a.x, -a.y); } // Unary negation.
+static __device__ __forceinline__ int3&     operator*=  (int3& a, const int3& b)           { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } // int3 helpers: every operation below is component-wise; a scalar operand is applied to each component.
+static __device__ __forceinline__ int3&     operator+=  (int3& a, const int3& b)           { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+static __device__ __forceinline__ int3&     operator-=  (int3& a, const int3& b)           { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+static __device__ __forceinline__ int3&     operator*=  (int3& a, int b)                   { a.x *= b; a.y *= b; a.z *= b; return a; }
+static __device__ __forceinline__ int3&     operator+=  (int3& a, int b)                   { a.x += b; a.y += b; a.z += b; return a; }
+static __device__ __forceinline__ int3&     operator-=  (int3& a, int b)                   { a.x -= b; a.y -= b; a.z -= b; return a; }
+static __device__ __forceinline__ int3      operator*   (const int3& a, const int3& b)     { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static __device__ __forceinline__ int3      operator+   (const int3& a, const int3& b)     { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static __device__ __forceinline__ int3      operator-   (const int3& a, const int3& b)     { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static __device__ __forceinline__ int3      operator*   (const int3& a, int b)             { return make_int3(a.x * b, a.y * b, a.z * b); }
+static __device__ __forceinline__ int3      operator+   (const int3& a, int b)             { return make_int3(a.x + b, a.y + b, a.z + b); }
+static __device__ __forceinline__ int3      operator-   (const int3& a, int b)             { return make_int3(a.x - b, a.y - b, a.z - b); }
+static __device__ __forceinline__ int3      operator*   (int a, const int3& b)             { return make_int3(a * b.x, a * b.y, a * b.z); }
+static __device__ __forceinline__ int3      operator+   (int a, const int3& b)             { return make_int3(a + b.x, a + b.y, a + b.z); }
+static __device__ __forceinline__ int3      operator-   (int a, const int3& b)             { return make_int3(a - b.x, a - b.y, a - b.z); }
+static __device__ __forceinline__ int3      operator-   (const int3& a)                    { return make_int3(-a.x, -a.y, -a.z); } // Unary negation.
+// In-place int4 arithmetic. The right-hand side is either another int4 (applied
+// component by component) or a scalar (applied to every component). The modified
+// vector is returned by reference.
+static __device__ __forceinline__ int4& operator*=(int4& lhs, const int4& rhs) { lhs = make_int4(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w); return lhs; }
+static __device__ __forceinline__ int4& operator+=(int4& lhs, const int4& rhs) { lhs = make_int4(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w); return lhs; }
+static __device__ __forceinline__ int4& operator-=(int4& lhs, const int4& rhs) { lhs = make_int4(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w); return lhs; }
+static __device__ __forceinline__ int4& operator*=(int4& v, int s) { v = make_int4(v.x * s, v.y * s, v.z * s, v.w * s); return v; }
+static __device__ __forceinline__ int4& operator+=(int4& v, int s) { v = make_int4(v.x + s, v.y + s, v.z + s, v.w + s); return v; }
+static __device__ __forceinline__ int4& operator-=(int4& v, int s) { v = make_int4(v.x - s, v.y - s, v.z - s, v.w - s); return v; }
+// Component-wise int4 arithmetic returning a new vector. Each operator copies the
+// vector operand into a local result, updates it per component and returns it.
+// Covers vector (op) vector, vector (op) scalar, scalar (op) vector and unary minus.
+static __device__ __forceinline__ int4 operator*(const int4& lhs, const int4& rhs) { int4 r = lhs; r.x *= rhs.x; r.y *= rhs.y; r.z *= rhs.z; r.w *= rhs.w; return r; }
+static __device__ __forceinline__ int4 operator+(const int4& lhs, const int4& rhs) { int4 r = lhs; r.x += rhs.x; r.y += rhs.y; r.z += rhs.z; r.w += rhs.w; return r; }
+static __device__ __forceinline__ int4 operator-(const int4& lhs, const int4& rhs) { int4 r = lhs; r.x -= rhs.x; r.y -= rhs.y; r.z -= rhs.z; r.w -= rhs.w; return r; }
+static __device__ __forceinline__ int4 operator*(const int4& v, int s) { int4 r = v; r.x *= s; r.y *= s; r.z *= s; r.w *= s; return r; }
+static __device__ __forceinline__ int4 operator+(const int4& v, int s) { int4 r = v; r.x += s; r.y += s; r.z += s; r.w += s; return r; }
+static __device__ __forceinline__ int4 operator-(const int4& v, int s) { int4 r = v; r.x -= s; r.y -= s; r.z -= s; r.w -= s; return r; }
+static __device__ __forceinline__ int4 operator*(int s, const int4& v) { int4 r = v; r.x = s * r.x; r.y = s * r.y; r.z = s * r.z; r.w = s * r.w; return r; }
+static __device__ __forceinline__ int4 operator+(int s, const int4& v) { int4 r = v; r.x = s + r.x; r.y = s + r.y; r.z = s + r.z; r.w = s + r.w; return r; }
+static __device__ __forceinline__ int4 operator-(int s, const int4& v) { int4 r = v; r.x = s - r.x; r.y = s - r.y; r.z = s - r.z; r.w = s - r.w; return r; }
+static __device__ __forceinline__ int4 operator-(const int4& v) { int4 r = v; r.x = -r.x; r.y = -r.y; r.z = -r.z; r.w = -r.w; return r; }
+// In-place uint2 arithmetic. The right-hand side is either another uint2 (applied
+// component by component) or an unsigned scalar (applied to every component). The
+// modified vector is returned by reference.
+static __device__ __forceinline__ uint2& operator*=(uint2& lhs, const uint2& rhs) { lhs = make_uint2(lhs.x * rhs.x, lhs.y * rhs.y); return lhs; }
+static __device__ __forceinline__ uint2& operator+=(uint2& lhs, const uint2& rhs) { lhs = make_uint2(lhs.x + rhs.x, lhs.y + rhs.y); return lhs; }
+static __device__ __forceinline__ uint2& operator-=(uint2& lhs, const uint2& rhs) { lhs = make_uint2(lhs.x - rhs.x, lhs.y - rhs.y); return lhs; }
+static __device__ __forceinline__ uint2& operator*=(uint2& v, unsigned int s) { v = make_uint2(v.x * s, v.y * s); return v; }
+static __device__ __forceinline__ uint2& operator+=(uint2& v, unsigned int s) { v = make_uint2(v.x + s, v.y + s); return v; }
+static __device__ __forceinline__ uint2& operator-=(uint2& v, unsigned int s) { v = make_uint2(v.x - s, v.y - s); return v; }
+// Component-wise uint2 arithmetic returning a new vector. Each operator copies the
+// vector operand into a local result, updates it per component and returns it.
+// Covers vector (op) vector, vector (op) scalar and scalar (op) vector.
+static __device__ __forceinline__ uint2 operator*(const uint2& lhs, const uint2& rhs) { uint2 r = lhs; r.x *= rhs.x; r.y *= rhs.y; return r; }
+static __device__ __forceinline__ uint2 operator+(const uint2& lhs, const uint2& rhs) { uint2 r = lhs; r.x += rhs.x; r.y += rhs.y; return r; }
+static __device__ __forceinline__ uint2 operator-(const uint2& lhs, const uint2& rhs) { uint2 r = lhs; r.x -= rhs.x; r.y -= rhs.y; return r; }
+static __device__ __forceinline__ uint2 operator*(const uint2& v, unsigned int s) { uint2 r = v; r.x *= s; r.y *= s; return r; }
+static __device__ __forceinline__ uint2 operator+(const uint2& v, unsigned int s) { uint2 r = v; r.x += s; r.y += s; return r; }
+static __device__ __forceinline__ uint2 operator-(const uint2& v, unsigned int s) { uint2 r = v; r.x -= s; r.y -= s; return r; }
+static __device__ __forceinline__ uint2 operator*(unsigned int s, const uint2& v) { uint2 r = v; r.x = s * r.x; r.y = s * r.y; return r; }
+static __device__ __forceinline__ uint2 operator+(unsigned int s, const uint2& v) { uint2 r = v; r.x = s + r.x; r.y = s + r.y; return r; }
+static __device__ __forceinline__ uint2 operator-(unsigned int s, const uint2& v) { uint2 r = v; r.x = s - r.x; r.y = s - r.y; return r; }
+// In-place uint3 arithmetic. The right-hand side is either another uint3 (applied
+// component by component) or an unsigned scalar (applied to every component). The
+// modified vector is returned by reference.
+static __device__ __forceinline__ uint3& operator*=(uint3& lhs, const uint3& rhs) { lhs = make_uint3(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z); return lhs; }
+static __device__ __forceinline__ uint3& operator+=(uint3& lhs, const uint3& rhs) { lhs = make_uint3(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z); return lhs; }
+static __device__ __forceinline__ uint3& operator-=(uint3& lhs, const uint3& rhs) { lhs = make_uint3(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z); return lhs; }
+static __device__ __forceinline__ uint3& operator*=(uint3& v, unsigned int s) { v = make_uint3(v.x * s, v.y * s, v.z * s); return v; }
+static __device__ __forceinline__ uint3& operator+=(uint3& v, unsigned int s) { v = make_uint3(v.x + s, v.y + s, v.z + s); return v; }
+static __device__ __forceinline__ uint3& operator-=(uint3& v, unsigned int s) { v = make_uint3(v.x - s, v.y - s, v.z - s); return v; }
+// Component-wise uint3 arithmetic returning a new vector. Each operator copies the
+// vector operand into a local result, updates it per component and returns it.
+// Covers vector (op) vector, vector (op) scalar and scalar (op) vector.
+static __device__ __forceinline__ uint3 operator*(const uint3& lhs, const uint3& rhs) { uint3 r = lhs; r.x *= rhs.x; r.y *= rhs.y; r.z *= rhs.z; return r; }
+static __device__ __forceinline__ uint3 operator+(const uint3& lhs, const uint3& rhs) { uint3 r = lhs; r.x += rhs.x; r.y += rhs.y; r.z += rhs.z; return r; }
+static __device__ __forceinline__ uint3 operator-(const uint3& lhs, const uint3& rhs) { uint3 r = lhs; r.x -= rhs.x; r.y -= rhs.y; r.z -= rhs.z; return r; }
+static __device__ __forceinline__ uint3 operator*(const uint3& v, unsigned int s) { uint3 r = v; r.x *= s; r.y *= s; r.z *= s; return r; }
+static __device__ __forceinline__ uint3 operator+(const uint3& v, unsigned int s) { uint3 r = v; r.x += s; r.y += s; r.z += s; return r; }
+static __device__ __forceinline__ uint3 operator-(const uint3& v, unsigned int s) { uint3 r = v; r.x -= s; r.y -= s; r.z -= s; return r; }
+static __device__ __forceinline__ uint3 operator*(unsigned int s, const uint3& v) { uint3 r = v; r.x = s * r.x; r.y = s * r.y; r.z = s * r.z; return r; }
+static __device__ __forceinline__ uint3 operator+(unsigned int s, const uint3& v) { uint3 r = v; r.x = s + r.x; r.y = s + r.y; r.z = s + r.z; return r; }
+static __device__ __forceinline__ uint3 operator-(unsigned int s, const uint3& v) { uint3 r = v; r.x = s - r.x; r.y = s - r.y; r.z = s - r.z; return r; }
+// In-place uint4 arithmetic. The right-hand side is either another uint4 (applied
+// component by component) or an unsigned scalar (applied to every component). The
+// modified vector is returned by reference.
+static __device__ __forceinline__ uint4& operator*=(uint4& lhs, const uint4& rhs) { lhs = make_uint4(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w); return lhs; }
+static __device__ __forceinline__ uint4& operator+=(uint4& lhs, const uint4& rhs) { lhs = make_uint4(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w); return lhs; }
+static __device__ __forceinline__ uint4& operator-=(uint4& lhs, const uint4& rhs) { lhs = make_uint4(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w); return lhs; }
+static __device__ __forceinline__ uint4& operator*=(uint4& v, unsigned int s) { v = make_uint4(v.x * s, v.y * s, v.z * s, v.w * s); return v; }
+static __device__ __forceinline__ uint4& operator+=(uint4& v, unsigned int s) { v = make_uint4(v.x + s, v.y + s, v.z + s, v.w + s); return v; }
+static __device__ __forceinline__ uint4& operator-=(uint4& v, unsigned int s) { v = make_uint4(v.x - s, v.y - s, v.z - s, v.w - s); return v; }
+// Component-wise uint4 arithmetic returning a new vector. Each operator copies the
+// vector operand into a local result, updates it per component and returns it.
+// Covers vector (op) vector, vector (op) scalar and scalar (op) vector.
+static __device__ __forceinline__ uint4 operator*(const uint4& lhs, const uint4& rhs) { uint4 r = lhs; r.x *= rhs.x; r.y *= rhs.y; r.z *= rhs.z; r.w *= rhs.w; return r; }
+static __device__ __forceinline__ uint4 operator+(const uint4& lhs, const uint4& rhs) { uint4 r = lhs; r.x += rhs.x; r.y += rhs.y; r.z += rhs.z; r.w += rhs.w; return r; }
+static __device__ __forceinline__ uint4 operator-(const uint4& lhs, const uint4& rhs) { uint4 r = lhs; r.x -= rhs.x; r.y -= rhs.y; r.z -= rhs.z; r.w -= rhs.w; return r; }
+static __device__ __forceinline__ uint4 operator*(const uint4& v, unsigned int s) { uint4 r = v; r.x *= s; r.y *= s; r.z *= s; r.w *= s; return r; }
+static __device__ __forceinline__ uint4 operator+(const uint4& v, unsigned int s) { uint4 r = v; r.x += s; r.y += s; r.z += s; r.w += s; return r; }
+static __device__ __forceinline__ uint4 operator-(const uint4& v, unsigned int s) { uint4 r = v; r.x -= s; r.y -= s; r.z -= s; r.w -= s; return r; }
+static __device__ __forceinline__ uint4 operator*(unsigned int s, const uint4& v) { uint4 r = v; r.x = s * r.x; r.y = s * r.y; r.z = s * r.z; r.w = s * r.w; return r; }
+static __device__ __forceinline__ uint4 operator+(unsigned int s, const uint4& v) { uint4 r = v; r.x = s + r.x; r.y = s + r.y; r.z = s + r.z; r.w = s + r.w; return r; }
+static __device__ __forceinline__ uint4 operator-(unsigned int s, const uint4& v) { uint4 r = v; r.x = s - r.x; r.y = s - r.y; r.z = s - r.z; r.w = s - r.w; return r; }
+// zero_value<T>() returns an all-zero value of type T. Only the primary template is
+// declared here; the explicit specializations below define it for float, float2
+// and float4.
+template<class T> static __device__ __forceinline__ T zero_value(void);
+template<> __device__ __forceinline__ float zero_value<float>(void)
+{
+    return 0.0f;
+}
+template<> __device__ __forceinline__ float2 zero_value<float2>(void)
+{
+    float2 z;
+    z.x = 0.0f;
+    z.y = 0.0f;
+    return z;
+}
+template<> __device__ __forceinline__ float4 zero_value<float4>(void)
+{
+    float4 z;
+    z.x = 0.0f;
+    z.y = 0.0f;
+    z.z = 0.0f;
+    z.w = 0.0f;
+    return z;
+}
+// Convenience overloads that assemble a wider vector from a narrower vector plus a
+// trailing scalar, or from two 2-component vectors. Each one forwards the unpacked
+// components to the per-component make_* function of the same name.
+static __device__ __forceinline__ float3 make_float3(const float2& xy, float z)
+{
+    return make_float3(xy.x, xy.y, z);
+}
+static __device__ __forceinline__ float4 make_float4(const float3& xyz, float w)
+{
+    return make_float4(xyz.x, xyz.y, xyz.z, w);
+}
+static __device__ __forceinline__ float4 make_float4(const float2& xy, const float2& zw)
+{
+    return make_float4(xy.x, xy.y, zw.x, zw.y);
+}
+static __device__ __forceinline__ int3 make_int3(const int2& xy, int z)
+{
+    return make_int3(xy.x, xy.y, z);
+}
+static __device__ __forceinline__ int4 make_int4(const int3& xyz, int w)
+{
+    return make_int4(xyz.x, xyz.y, xyz.z, w);
+}
+static __device__ __forceinline__ int4 make_int4(const int2& xy, const int2& zw)
+{
+    return make_int4(xy.x, xy.y, zw.x, zw.y);
+}
+static __device__ __forceinline__ uint3 make_uint3(const uint2& xy, unsigned int z)
+{
+    return make_uint3(xy.x, xy.y, z);
+}
+static __device__ __forceinline__ uint4 make_uint4(const uint3& xyz, unsigned int w)
+{
+    return make_uint4(xyz.x, xyz.y, xyz.z, w);
+}
+static __device__ __forceinline__ uint4 make_uint4(const uint2& xy, const uint2& zw)
+{
+    return make_uint4(xy.x, xy.y, zw.x, zw.y);
+}
+// Exchange the values of a and b through a temporary copy of a.
+template<class T> static __device__ __forceinline__ void swap(T& a, T& b)
+{
+    T held = a;
+    a = b;
+    b = held;
+}
+//------------------------------------------------------------------------
+// Triangle ID <-> float32 conversion helpers that support very large triangle IDs.
+//
+// IDs up to and including 16777216 (0x01000000), as well as negative IDs, are converted
+// with a plain numeric cast, which keeps compatibility with previous versions. Larger
+// IDs are instead stored in the float's bit pattern, offset by 0x4a800000, so each one
+// maps to a unique float32 whose numeric value is not equal to the ID. The largest ID
+// that converts to float32 and back without generating inf or nan is 889192447.
+static __device__ __forceinline__ int float_to_triidx(float x)
+{
+    // Small (or negative) values were stored numerically; everything else carries the ID in its bits.
+    return (x <= 16777216.f) ? (int)x : (__float_as_int(x) - 0x4a800000);
+}
+static __device__ __forceinline__ float triidx_to_float(int x)
+{
+    // Mirror of float_to_triidx(): numeric cast for small IDs, bit-pattern encoding otherwise.
+    return (x <= 0x01000000) ? (float)x : __int_as_float(0x4a800000 + x);
+}
+//------------------------------------------------------------------------
+// Coalesced atomics. These are all done via macros.
+//
+// On compute capability 7.x and higher, threads of a warp that share the same group
+// key first sum their values into one slot of the CA_TEMP buffer, and only the
+// group's leader thread issues the atomicAdd to the destination. On lower compute
+// capabilities the same macro names expand to plain atomicAdd calls.
+#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
+// Identifier of the temporary buffer that the macros below index into.
+#define CA_TEMP       _ca_temp
+// Declares CA_TEMP as a float pointer (presumably for use in a function parameter
+// list, so a helper can receive the buffer -- confirm against callers).
+#define CA_TEMP_PARAM float* CA_TEMP
+// Declares CA_TEMP as a __shared__ float array with threads_per_block elements.
+#define CA_DECLARE_TEMP(threads_per_block) \
+    __shared__ float CA_TEMP[(threads_per_block)]
+// Declares _ca_leader and _ca_ptr in the enclosing scope and initializes them for
+// the calling thread:
+//   - the linear thread index is computed from threadIdx.x and threadIdx.y only;
+//   - __match_any_sync() yields the mask of threads within thread_mask whose group
+//     value matches this thread's;
+//   - the leader is the lowest set lane in that mask (__ffs() - 1);
+//   - _ca_ptr points at the leader's slot in _ca_temp (index warp * 32 + leader).
+// Because the two variables are declared outside the do/while, the macro can be
+// expanded only once per scope.
+#define CA_SET_GROUP_MASK(group, thread_mask)                   \
+    bool   _ca_leader;                                          \
+    float* _ca_ptr;                                             \
+    do {                                                        \
+        int tidx   = threadIdx.x + blockDim.x * threadIdx.y;    \
+        int lane   = tidx & 31;                                 \
+        int warp   = tidx >> 5;                                 \
+        int tmask  = __match_any_sync((thread_mask), (group));  \
+        int leader = __ffs(tmask) - 1;                          \
+        _ca_leader = (leader == lane);                          \
+        _ca_ptr    = &_ca_temp[((warp << 5) + leader)];         \
+    } while(0)
+// Same as CA_SET_GROUP_MASK with all 32 lanes of the warp participating.
+#define CA_SET_GROUP(group) \
+    CA_SET_GROUP_MASK((group), 0xffffffffu)
+// Grouped add: the leader zeroes the group's slot, every thread atomically adds its
+// value to the slot, then the leader atomically adds the slot's total to ptr.
+// Only the leader's ptr expression is used for the final add, so all threads of a
+// group are expected to target the same address. Requires _ca_leader and _ca_ptr
+// from CA_SET_GROUP / CA_SET_GROUP_MASK to be in scope.
+// NOTE(review): there is no explicit warp synchronization between the three steps;
+// presumably this relies on the group's threads executing them in lockstep -- confirm.
+#define caAtomicAdd(ptr, value)         \
+    do {                                \
+        if (_ca_leader)                 \
+            *_ca_ptr = 0.f;             \
+        atomicAdd(_ca_ptr, (value));    \
+        if (_ca_leader)                 \
+            atomicAdd((ptr), *_ca_ptr); \
+    } while(0)
+// Grouped add of x, y and w to ptr[0], ptr[1] and ptr[3]; ptr[2] is not touched.
+#define caAtomicAdd3_xyw(ptr, x, y, w)  \
+    do {                                \
+        caAtomicAdd((ptr), (x));        \
+        caAtomicAdd((ptr)+1, (y));      \
+        caAtomicAdd((ptr)+3, (w));      \
+    } while(0)
+// Grouped add of value to ptr[idx]. The group key is idx XOR (level << 27), so
+// threads are grouped by the combination of idx and level. The group state is set
+// up inside this macro's own do/while scope.
+#define caAtomicAddTexture(ptr, level, idx, value)  \
+    do {                                            \
+        CA_SET_GROUP((idx) ^ ((level) << 27));      \
+        caAtomicAdd((ptr)+(idx), (value));          \
+    } while(0)
+//------------------------------------------------------------------------
+// Disable atomic coalescing for compute capability lower than 7.x
+//
+// The macro names are kept so call sites compile unchanged: CA_TEMP becomes a plain
+// float, the group setup macros expand to nothing, and every caAtomicAdd* macro
+// forwards directly to atomicAdd.
+#else // __CUDA_ARCH__ >= 700
+#define CA_TEMP _ca_temp
+#define CA_TEMP_PARAM float CA_TEMP
+#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
+#define CA_SET_GROUP_MASK(group, thread_mask)
+#define CA_SET_GROUP(group)
+#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
+// Plain atomic add of x, y and w to ptr[0], ptr[1] and ptr[3]; ptr[2] is not touched.
+#define caAtomicAdd3_xyw(ptr, x, y, w)  \
+    do {                                \
+        atomicAdd((ptr), (x));          \
+        atomicAdd((ptr)+1, (y));        \
+        atomicAdd((ptr)+3, (w));        \
+    } while(0)
+// The level argument is ignored here; value is added directly to ptr[idx].
+#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
+#endif // __CUDA_ARCH__ >= 700
+//------------------------------------------------------------------------
+#endif // __CUDACC__