Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/ci_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,41 @@ platforms:
- name: test
run: pytest tests/ --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

hygon:
runner_label: Hygon
execution_mode: agent_local
image:
dockerfile: images/hygon/
build_args:
BASE_IMAGE: image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10
APT_MIRROR: http://archive.ubuntu.com/ubuntu
PIP_INDEX_URL: https://pypi.org/simple
docker_args:
- "--privileged"
- "--network=host"
- "--ipc=host"
- "--device=/dev/kfd"
- "--device=/dev/mkfd"
- "--device=/dev/dri"
- "--group-add=video"
volumes:
- /opt/hyhal:/opt/hyhal:ro
setup: pip install .[dev] --no-build-isolation
jobs:
gpu:
type: unittest
resources:
ngpus: 1
gpu_style: none
memory: 32GB
shm_size: 64g
timeout: 3600
queue_timeout: 600
junit_path: test-results.xml
stages:
- name: test
run: pytest tests/ --devices hygon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

ascend:
runner_label: Ascend
execution_mode: agent_local
Expand Down
129 changes: 126 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ set(PYBIND11_ENABLE_EXTRAS ON)
option(WITH_CPU "Enable CPU backend" OFF)
option(WITH_NVIDIA "Enable CUDA backend" OFF)
option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF)
option(WITH_HYGON "Enable Hygon GPU backend" OFF)
option(WITH_METAX "Enable MetaX backend" OFF)
option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
option(WITH_MOORE "Enable Moore backend" OFF)
Expand All @@ -29,6 +30,31 @@ option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)

set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk")

function(_infiniops_find_hygon_cuda_root out_var dtk_root)
set(_candidates
"${dtk_root}/cuda"
"${dtk_root}/cuda/cuda"
)

file(GLOB _versioned_cuda_dirs LIST_DIRECTORIES true "${dtk_root}/cuda/cuda-*")
if(_versioned_cuda_dirs)
list(SORT _versioned_cuda_dirs)
list(REVERSE _versioned_cuda_dirs)
list(APPEND _candidates ${_versioned_cuda_dirs})
endif()

foreach(_candidate IN LISTS _candidates)
if(EXISTS "${_candidate}/bin/nvcc")
set(${out_var} "${_candidate}" PARENT_SCOPE)
return()
endif()
endforeach()

set(${out_var} "" PARENT_SCOPE)
endfunction()

if(AUTO_DETECT_DEVICES)
message(STATUS "Auto-detecting available devices...")

Expand All @@ -48,6 +74,24 @@ if(AUTO_DETECT_DEVICES)
message(STATUS "Auto-detected Iluvatar environment.")
endif()

set(_hygon_detected FALSE)
if(DEFINED ENV{DTK_ROOT} AND NOT "$ENV{DTK_ROOT}" STREQUAL "")
_infiniops_find_hygon_cuda_root(_HYGON_CUDA_DETECT_ROOT "$ENV{DTK_ROOT}")
if(_HYGON_CUDA_DETECT_ROOT)
set(_hygon_detected TRUE)
endif()
else()
_infiniops_find_hygon_cuda_root(_HYGON_CUDA_DETECT_ROOT "${_DEFAULT_HYGON_DTK_ROOT}")
if(_HYGON_CUDA_DETECT_ROOT)
set(_hygon_detected TRUE)
endif()
endif()

if(_hygon_detected)
set(WITH_HYGON ON)
message(STATUS "Auto-detected Hygon environment.")
endif()

if(DEFINED ENV{MACA_PATH})
set(WITH_METAX ON)
message(STATUS "Auto-detected MetaX environment from MACA_PATH")
Expand Down Expand Up @@ -172,6 +216,17 @@ if(WITH_TORCH)
OUTPUT_STRIP_TRAILING_WHITESPACE
)

execute_process(
COMMAND ${Python_EXECUTABLE} -c "import pathlib, torch; p = pathlib.Path(torch.__file__).resolve().parent.parent / 'torch.libs'; print(str(p) if p.exists() else '')"
OUTPUT_VARIABLE _torch_private_lib_dir
OUTPUT_STRIP_TRAILING_WHITESPACE
)

set(TORCH_RUNTIME_DIRS ${_torch_lib_dirs})
if(_torch_private_lib_dir)
list(APPEND TORCH_RUNTIME_DIRS ${_torch_private_lib_dir})
endif()

find_library(TORCH_LIB torch HINTS ${_torch_lib_dirs} REQUIRED)
find_library(TORCH_CPU_LIB torch_cpu HINTS ${_torch_lib_dirs} REQUIRED)
find_library(C10_LIB c10 HINTS ${_torch_lib_dirs} REQUIRED)
Expand Down Expand Up @@ -221,14 +276,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

# Only one CUDA-like GPU backend can be enabled at a time.
set(_gpu_backend_count 0)
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE WITH_ASCEND)
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE WITH_ASCEND)
if(${_gpu_backend})
math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1")
endif()
endforeach()

if(_gpu_backend_count GREATER 1)
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_HYGON`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
endif()

if(WITH_NVIDIA)
Expand Down Expand Up @@ -267,6 +322,70 @@ if(WITH_ILUVATAR)
add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-x$<SEMICOLON>ivcore>)
endif()

if(WITH_HYGON)
add_compile_definitions(WITH_HYGON=1)
set(DTK_ROOT $ENV{DTK_ROOT})
if(NOT DTK_ROOT)
set(DTK_ROOT "${_DEFAULT_HYGON_DTK_ROOT}")
endif()
if(NOT EXISTS "${DTK_ROOT}")
message(FATAL_ERROR "`WITH_HYGON` is `ON` but `DTK_ROOT` (`${DTK_ROOT}`) does not exist.")
endif()

set(_HYGON_ARCH_DEFAULT "gfx906")
if(DEFINED ENV{HYGON_ARCH} AND NOT "$ENV{HYGON_ARCH}" STREQUAL "")
set(_HYGON_ARCH_DEFAULT "$ENV{HYGON_ARCH}")
else()
find_program(HYGON_ROCMINFO_EXECUTABLE NAMES rocminfo HINTS "${DTK_ROOT}/bin")
if(HYGON_ROCMINFO_EXECUTABLE)
execute_process(
COMMAND ${HYGON_ROCMINFO_EXECUTABLE}
OUTPUT_VARIABLE _HYGON_ROCMINFO_OUTPUT
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE
)
string(REGEX MATCH "gfx[0-9]+" _HYGON_ARCH_AUTO "${_HYGON_ROCMINFO_OUTPUT}")
if(_HYGON_ARCH_AUTO)
set(_HYGON_ARCH_DEFAULT "${_HYGON_ARCH_AUTO}")
endif()
endif()
endif()

set(HYGON_ARCH "${_HYGON_ARCH_DEFAULT}" CACHE STRING "Hygon GPU architecture")
_infiniops_find_hygon_cuda_root(HYGON_CUDA_ROOT "${DTK_ROOT}")

if(NOT HYGON_CUDA_ROOT)
message(FATAL_ERROR "`WITH_HYGON` is `ON` but no DTK `nvcc` was found under `${DTK_ROOT}`. Checked `${DTK_ROOT}/cuda/bin/nvcc`, `${DTK_ROOT}/cuda/cuda/bin/nvcc`, and `${DTK_ROOT}/cuda/cuda-*/bin/nvcc`.")
endif()

set(CMAKE_CUDA_COMPILER "${HYGON_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "Hygon CUDA compiler (DTK nvcc)")
set(CUDAToolkit_ROOT "${HYGON_CUDA_ROOT}" CACHE PATH "Hygon CUDA toolkit root")
set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Disable default CUDA arch flags for Hygon" FORCE)
set(CMAKE_CUDA_FLAGS "-std=c++17 -fPIC -arch=${HYGON_ARCH} -Wno-return-type -Wno-error=unused-private-field" CACHE STRING "Hygon CUDA flags")
set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Hygon")

# DTK's nvcc wrapper derives its toolkit root from `CUDA_PATH`.
set(ENV{CUDA_PATH} "${HYGON_CUDA_ROOT}")
set(ENV{CUDA_HOME} "${HYGON_CUDA_ROOT}")

# DTK's nvcc wrapper may invoke `nvcc` by name during compiler checks.
set(ENV{PATH} "${HYGON_CUDA_ROOT}/bin:$ENV{PATH}")

# The actual Ninja build runs in fresh processes. Keep a launcher command
# for CUDA-backed Python bindings that need the DTK wrapper environment.
set(_HYGON_RULE_LAUNCH_ENV
"${CMAKE_COMMAND} -E env CUDA_PATH=${HYGON_CUDA_ROOT} CUDA_HOME=${HYGON_CUDA_ROOT} PATH=${HYGON_CUDA_ROOT}/bin:$ENV{PATH}")

include_directories("${DTK_ROOT}/include")
include_directories("${HYGON_CUDA_ROOT}/include")
link_directories("${DTK_ROOT}/lib")
link_directories("${HYGON_CUDA_ROOT}/lib64")

message(STATUS "Hygon: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${HYGON_ARCH}, DTK root ${DTK_ROOT}")
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
endif()

if(WITH_METAX)
add_compile_definitions(WITH_METAX=1)

Expand Down Expand Up @@ -350,14 +469,18 @@ if(WITH_ASCEND)
endif()

# If all other platforms are not enabled, CPU is enabled by default.
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_HYGON AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
add_compile_definitions(WITH_CPU=1)
endif()

if(WITH_TORCH OR WITH_METAX OR WITH_MOORE)
set(PYBIND11_ENABLE_EXTRAS OFF)
endif()

if(WITH_HYGON AND NOT EXISTS "${DTK_ROOT}/llvm/lib/LLVMgold.so")
set(PYBIND11_ENABLE_EXTRAS OFF)
endif()

add_subdirectory(src)

if(NOT GENERATE_PYTHON_BINDINGS)
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# InfiniOps

InfiniOps is a high-performance, cross-platform operator library supporting multiple backends: CPU, Nvidia, MetaX, Iluvatar, Moore, Cambricon, and more.
InfiniOps is a high-performance, cross-platform operator library supporting multiple backends: CPU, Nvidia, MetaX, Iluvatar, Hygon, Moore, Cambricon, and more.

## Prerequisites

Expand Down Expand Up @@ -31,12 +31,16 @@ pip install . -C cmake.define.WITH_CPU=ON -C cmake.define.WITH_NVIDIA=ON
| `-DWITH_NVIDIA=[ON\|OFF]` | Compile the Nvidia implementation | OFF |
| `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | OFF |
| `-DWITH_ILUVATAR=[ON\|OFF]` | Compile the Iluvatar implementation | OFF |
| `-DWITH_HYGON=[ON\|OFF]` | Compile the Hygon implementation | OFF |
| `-DWITH_MOORE=[ON\|OFF]` | Compile the Moore implementation | OFF |
| `-DWITH_CAMBRICON=[ON\|OFF]` | Compile the Cambricon implementation | OFF |
| `-DWITH_ASCEND=[ON\|OFF]` | Compile the Ascend implementation | OFF |
| `-DAUTO_DETECT_DEVICES=[ON\|OFF]` | Auto-detect available platforms | ON |

If no accelerator options are provided and auto-detection finds nothing, `WITH_CPU` is enabled by default.

For Hygon builds, set `DTK_ROOT` to the DTK installation root if it is not installed at `/opt/dtk`. You can override the default DCU arch with `-DHYGON_ARCH=<arch>` when configuring CMake.

## Contributing

See [CONTRIBUTING.md](CONTRIBUTING.md) for code style, commit conventions, PR workflow, development guide, and troubleshooting.
Expand Down
8 changes: 7 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,14 @@ foreach(source_file ${EXAMPLE_SOURCES})
target_link_libraries(${example_name} PRIVATE infiniops)

target_include_directories(${example_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

get_filename_component(example_dir ${source_file} DIRECTORY)

target_include_directories(${example_name} PRIVATE ${example_dir})

if(WITH_TORCH)
foreach(_torch_dir ${TORCH_RUNTIME_DIRS})
target_link_options(${example_name} PRIVATE "LINKER:-rpath-link,${_torch_dir}")
endforeach()
endif()
endforeach()
45 changes: 42 additions & 3 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,34 @@ if(WITH_ILUVATAR)
list(APPEND DEVICE_LIST "iluvatar")
endif()

if(WITH_HYGON)
set(HYGON_PATTERNS
"native/cuda/*.cc"
"native/cuda/*.cpp"
"native/cuda/*.cu"
"native/cuda/hygon/*.cc"
"native/cuda/hygon/*.cpp"
"native/cuda/hygon/*.cu"
)

file(GLOB_RECURSE HYGON_SOURCES CONFIGURE_DEPENDS ${HYGON_PATTERNS})

enable_language(CUDA)

target_compile_definitions(infiniops PUBLIC WITH_HYGON=1)
target_sources(infiniops PRIVATE ${HYGON_SOURCES})

find_package(CUDAToolkit REQUIRED)
target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas)

set_target_properties(infiniops PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
)

list(APPEND DEVICE_LIST "hygon")
endif()

if(WITH_METAX)
set(METAX_PATTERNS
"native/cuda/*.cc"
Expand Down Expand Up @@ -525,7 +553,7 @@ if(GENERATE_PYTHON_BINDINGS)
list(APPEND PYBIND11_COMPILE_SOURCES ${PYBIND11_DISPATCH_SOURCES})

# TODO: There might be a better solution.
if(WITH_NVIDIA OR WITH_ILUVATAR)
if(WITH_NVIDIA OR WITH_ILUVATAR OR WITH_HYGON)
set_source_files_properties(${PYBIND11_COMPILE_SOURCES} PROPERTIES LANGUAGE CUDA)
endif()

Expand Down Expand Up @@ -567,6 +595,13 @@ if(GENERATE_PYTHON_BINDINGS)
target_compile_options(ops PRIVATE "-x" "musa")
endif()

if(WITH_HYGON)
set_target_properties(ops PROPERTIES
RULE_LAUNCH_COMPILE "${_HYGON_RULE_LAUNCH_ENV}"
RULE_LAUNCH_LINK "${_HYGON_RULE_LAUNCH_ENV}"
)
endif()

target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
target_link_libraries(ops PRIVATE infiniops)

Expand All @@ -580,9 +615,13 @@ if(GENERATE_PYTHON_BINDINGS)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
endif()
set(_INFINIOPS_INSTALL_RPATH "$ORIGIN")
if(WITH_TORCH)
list(APPEND _INFINIOPS_INSTALL_RPATH ${TORCH_RUNTIME_DIRS})
endif()
Comment on lines +618 to +621
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个地方会影响其他平台,是不是也应该测试一下啊?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

嗯嗯,计划就是初审完,改完一波再所有测试一遍


set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(infiniops PROPERTIES INSTALL_RPATH "${_INFINIOPS_INSTALL_RPATH}")
set_target_properties(ops PROPERTIES INSTALL_RPATH "${_INFINIOPS_INSTALL_RPATH}")

install(TARGETS infiniops ops DESTINATION .)

Expand Down
40 changes: 40 additions & 0 deletions src/native/cuda/hygon/blas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef INFINI_OPS_HYGON_BLAS_H_
#define INFINI_OPS_HYGON_BLAS_H_

#include <utility>

// clang-format off
#include "cublas_v2.h"
// clang-format on

#include "data_type.h"
#include "native/cuda/blas.h"
#include "native/cuda/hygon/blas_utils.h"
#include "native/cuda/hygon/runtime_.h"

namespace infini::ops {

template <>
struct Blas<Device::Type::kHygon> : public Runtime<Device::Type::kHygon> {
using BlasHandle = cublasHandle_t;

static constexpr auto BLAS_OP_N = CUBLAS_OP_N;

static constexpr auto BLAS_OP_T = CUBLAS_OP_T;

static constexpr auto BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT_TENSOR_OP;

static constexpr auto BlasCreate = cublasCreate;

static constexpr auto BlasSetStream = cublasSetStream;

static constexpr auto BlasDestroy = cublasDestroy;

static constexpr auto BlasGemmStridedBatchedEx = [](auto&&... args) {
return cublasGemmStridedBatchedEx(std::forward<decltype(args)>(args)...);
};
};

} // namespace infini::ops

#endif
Loading
Loading