Skip to content

[cmake] use wfn91's linear algebra discovery modules #254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ variables:
${TA_PYTHON}
${ENABLE_CUDA}
${BLA_VENDOR}
${BLA_THREADS}
${ENABLE_SCALAPACK}

before_script:
Expand All @@ -37,8 +38,9 @@ ubuntu:
${TA_CONFIG}
${TA_TARGETS}
MPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root'
blacs_LIBRARIES=scalapack-openmpi
scalapack_LIBRARIES=scalapack-openmpi
ScaLAPACK_LIBRARIES="scalapack-openmpi;lapack;blas;MPI::MPI_C"
#blacs_LIBRARIES=scalapack-openmpi
#scalapack_LIBRARIES=scalapack-openmpi
#lapack_LIBRARIES=lapack
artifacts:
paths:
Expand All @@ -53,7 +55,8 @@ ubuntu:
- IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
CXX: [ g++ ]
BUILD_TYPE : [ "Release" ]
BLA_VENDOR : [ "BLA_VENDOR=Intel10_64lp_seq", "BLA_VENDOR=Intel10_64lp" ]
BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ]
BLA_THREADS : [ "intelmkl_PREFERED_THREAD_LEVEL=tbb" ]
# ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL
- IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ]
Expand All @@ -64,4 +67,4 @@ ubuntu:
CXX: [ g++ ]
BUILD_TYPE : [ "Release", "Debug" ]
ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]
TA_TARGETS : [ "tiledarray examples" ]
TA_TARGETS : [ "tiledarray examples" ]
40 changes: 37 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

cmake_minimum_required (VERSION 3.15.0) # need list(PREPEND for toolchains

# Preload versions/tags of all dependencies ====================================
include(external/versions.cmake)

###############################################################################
# Bring ValeevGroup cmake toolkit
###############################################################################
Expand All @@ -34,6 +37,7 @@ FetchContent_Populate(
vg_cmake_kit
QUIET
GIT_REPOSITORY https://github.com/ValeevGroup/kit-cmake.git
GIT_TAG ${TA_TRACKED_VGCMAKEKIT_TAG}
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/vg
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/vg-build
SUBBUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/vg-subbuild
Expand Down Expand Up @@ -93,9 +97,6 @@ include(CMakeDependentOption)
include(CMakePackageConfigHelpers)
include(FeatureSummary)

# Preload versions/tags of all dependencies ====================================
include(external/versions.cmake)

set(MPI_CXX_SKIP_MPICXX TRUE CACHE BOOL "MPI_CXX_SKIP_MPICXX")

# Configure options =======================================================
Expand All @@ -105,6 +106,9 @@ add_feature_info(MPI ENABLE_MPI "Message-Passing Interface supports distributed-
option(ENABLE_SCALAPACK "Enable ScaLAPACK Bindings in TiledArray" OFF)
add_feature_info(ScaLAPACK ENABLE_SCALAPACK "ScaLAPACK provides distributed linear algebra")

option(ENABLE_WFN91_LINALG_DISCOVERY_KIT "Use linear algebra discovery kit from github.com/wavefunction91 [recommended]" ON)
add_feature_info(WFN91LinearAlgebraDiscoveryKit ENABLE_WFN91_LINALG_DISCOVERY_KIT "Linear algebra discovery kit from github.com/wavefunction91 supports many more corner cases than the default CMake modules and/or ICL's BLAS++/LAPACK++ modules")

redefaultable_option(ENABLE_TBB "Enable use of TBB with MADNESS" OFF)
add_feature_info(TBB ENABLE_TBB "Intel Thread-Building Blocks (TBB) supports programming shared-memory systems")

Expand Down Expand Up @@ -286,6 +290,36 @@ endif()
include(external/madness.cmake)
detect_MADNESS_configuration()
include(external/eigen.cmake)

###### discover linear algebra

# use NWChemEx/David's linear algebra discovery modules?
# - yes => Invoke first to configure the correct libraries config and run modules to find BLAS/LAPACK/ScaLAPACK(if needed)
# - no => BLAS/LAPACK will be discovered by BLAS++/LAPACK++ (loaded by BTAS) which use standard CMake modules or
# their custom modules; if needed, ScaLAPACK will be discovered by BLACS++
if(ENABLE_WFN91_LINALG_DISCOVERY_KIT)
  # Pull in the discovery modules shipped via the Valeev Group CMake kit.
  include("${vg_cmake_kit_SOURCE_DIR}/modules/FetchWfn91LinAlgModules.cmake")

  if(ENABLE_SCALAPACK)
    # ScaLAPACK requested: locate it, then reuse its link line for any of
    # BLAS/LAPACK that has not been specified already
    # (ScaLAPACK necessarily contains a BLAS/LAPACK linker by standard).
    # TODO: Tell David to write a macro that hides this verbosity from user space
    find_package(ScaLAPACK REQUIRED)
    if(NOT BLAS_LIBRARIES)
      set(BLAS_LIBRARIES "${ScaLAPACK_LIBRARIES}" CACHE STRING "BLAS LIBRARIES")
    endif()
    if(NOT LAPACK_LIBRARIES)
      set(LAPACK_LIBRARIES "${ScaLAPACK_LIBRARIES}" CACHE STRING "LAPACK LIBRARIES")
    endif()
  else()
    # No ScaLAPACK: locate LAPACK, then reuse its link line for BLAS if BLAS
    # has not been specified already
    # (LAPACK necessarily contains a BLAS linker by standard).
    # TODO: Tell David to write a macro that hides this verbosity from user space
    find_package(LAPACK REQUIRED)
    if(NOT BLAS_LIBRARIES)
      set(BLAS_LIBRARIES "${LAPACK_LIBRARIES}" CACHE STRING "BLAS LIBRARIES")
    endif()
  endif()
endif()
# BTAS does a better job of building and checking Boost since it uses Boost::serialization
# it also memorized the location of its config for use from install tree
include(FindOrFetchBTAS)
Expand Down
89 changes: 70 additions & 19 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ to use TiledArray in a distributed memory environment. Note, if you
build MADNESS yourself, you must also configure MADNESS with `ENABLE_MPI=OFF`
to enable this option.

## BLAS/LAPACK
## Linear Algebra: BLAS/LAPACK/ScaLAPACK

Even for basic operation TiledArray requires a serial BLAS implementation, either by linking with a
serial version of the BLAS library or by setting the number of threads to one
Expand All @@ -230,38 +230,88 @@ set the environment variables (e.g. `OMP_NUM_THREADS`, `MKL_NUM_THREADS`, etc.)
to ensure single-threaded execution of BLAS/LAPACK kernels
as needed.


As of version 1.0 TiledArray also provides a direct (non-iterative) linear solvers API
implemented using LAPACK and (optionally) ScaLAPACK. Therefore LAPACK is now a mandatory
prerequisite of TiledArray

BLAS/LAPACK dependencies are provided by the BTAS library, which in turn uses BLAS++/LAPACK++
C++ linear algebra packages to discover the BLAS and LAPACK libraries at configure time.
The most common scenario is where TiledArray will configure and compile BTAS dependency
and its BLAS++/LAPACK++ prerequisites from source (this is strongly recommended). The following
CMake variables can be used to control how BLAS/LAPACK discovery occurs:

* `BLA_STATIC` -- indicates whether static or shared LAPACK and BLAS libraries will be preferred.
* `BLA_VENDOR` -- controls which vendor BLAS/LAPACK library will be sought
prerequisite of TiledArray. The use of ScaLAPACK can be enabled by setting CMake cache
variable `ENABLE_SCALAPACK` to `ON`.

Robust discovery of linear algebra libraries, and _especially_ their distributed-memory
variants, is a complex process. Unfortunately even for serial/shared-memory linear
algebra libraries only basic scenarios are supported by the standard CMake modules
(e.g., [BLAS](https://cmake.org/cmake/help/latest/module/FindBLAS.html) and
[LAPACK](https://cmake.org/cmake/help/latest/module/FindLAPACK.html)).
There are several discovery mechanisms available for robust discovery of linear
algebra in TA:
- By specifying the `BLAS_LIBRARIES`, `LAPACK_LIBRARIES`, and (if `ENABLE_SCALAPACK` is on)
`ScaLAPACK_LIBRARIES` CMake cache variables via CMake command line or via a toolchain.
Doing this overrides all other mechanisms of discovery described below and is recommended
if the discovery fails for some reason. To help with setting these variables for specific
platforms consider using toolchain files from
[the Valeev Group CMake kit](https://github.com/ValeevGroup/kit-cmake/tree/master/toolchains) (see examples above).
- The default discovery method utilizes [the linear algebra discovery kit](https://github.com/wavefunction91/cmake-modules) recently developed by
[David Williams-Young](https://github.com/wavefunction91) and co-workers
for the [NWChemEx project](https://github.com/NWChemEx-Project). The
discovery modules will override the standard CMake modules for BLAS and LAPACK,
and provide modules to discover BLACS and ScaLAPACK. These modules
will then be invoked to discover the linear algebra libraries as robustly as
feasible. The following CMake cache variables can be used to control the behavior
of the NWChemEx discovery kit:
- `{BLAS,LAPACK,ScaLAPACK}_PREFERS_STATIC`: if set to on, will prefer to link the corresponding component statically.
- `{BLAS,LAPACK,ScaLAPACK}_PREFERENCE_LIST`: these specify the variants of the corresponding libraries to search,
in the order of preference. The following are permitted values in these lists:
- `ReferenceBLAS`: NETLIB reference implementations
- `IntelMKL`: Intel Math Kernel Library
- `IBMESSL`: IBM Engineering and Scientific Subroutine Library
- `BLIS`: BLAS-Like Instantiation Software
- `OpenBLAS`: OpenBLAS
- `Accelerate`: Apple's Accelerate framework
- `FLAME`: (LAPACK-only) [libFLAME](https://www.cs.utexas.edu/~flame/web/libFLAME.html)
*N.B.* These differ from the recognized values of the `BLA_VENDOR` variable used by the [BLAS+LAPACK CMake modules](https://cmake.org/cmake/help/latest/module/FindBLAS.html).
- If the use of the NWChemEx kit is disabled by setting CMake cache variable `ENABLE_WFN91_LINALG_DISCOVERY_KIT` to `OFF`,
BLAS/LAPACK are imported transitively via the BLAS++/LAPACK++ libraries (which are themselves
imported transitively via the BTAS library). Under the most common scenario, where TiledArray
will configure and compile BTAS dependency and its BLAS++/LAPACK++ prerequisites from source
(this is strongly recommended), BLAS/LAPACK will thus be discovered and imported by
BLAS++/LAPACK++ during the TA configuration. There are 2 mechanisms by which BLAS++/LAPACK++
discover BLAS/LAPACK:
- _the built-in custom discovery kit_; no options exist to provide any control
- standard CMake BLAS/LAPACK modules.

The latter is used if CMake cache variable `BLA_VENDOR` is specified:
- `BLA_VENDOR` -- controls which vendor BLAS/LAPACK library will be sought

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use BLA_VENDOR, we set a priority list BLAS_PREFERENCE_LIST which will have the same effect as BLA_VENDOR when set to a single item. We do this to decouple LAPACK/BLAS (e.g. in the case of BLIS + NETLIB / BLIS + FLAME), but due to the hierarchy discovery, allowing full discovery (e.g. setting BLAS_PREFERENCE_LIST="IntelMKL" and letting FindLAPACK satisfy the FindBLAS dependency and recognize that it contains a LAPACK linker) will do the right thing.

We could add a BLA_VENDOR <-> BLAS_PREFERENCE_LIST compatibility, or you propagate in the VG CMake kit, which ever you prefer

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is least surprising to emulate Find{BLAS,LAPACK} as much as reasonable, no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i.e. I think it's reasonable to read same input vars as those, i.e. read BLA_{VENDOR,STATIC}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose that is the least surprising behaviour, I'll patch that today

(see [CMake docs](https://cmake.org/cmake/help/latest/module/FindLAPACK.html));
by default all possible vendor libraries will be considered. E.g., to force the use of the Accelerate
framework on MacOS use `-DBLA_VENDOR=Apple`.

More information can be found in the installation instructions for
[BLAS++](https://icl.bitbucket.io/blaspp/md__i_n_s_t_a_l_l.html) and
[LAPACK++](https://icl.bitbucket.io/lapackpp/md__i_n_s_t_a_l_l.html).
More information can be found in the installation instructions for
[BLAS++](https://icl.bitbucket.io/blaspp/md__i_n_s_t_a_l_l.html) and
[LAPACK++](https://icl.bitbucket.io/lapackpp/md__i_n_s_t_a_l_l.html).

Note that BLAS++/LAPACK++ discover BLAS and LAPACK only; ScaLAPACK
library is always discovered using the NWChemEx kit.

Also note that all discovery methods respect the following CMake cache variable:
- `BLA_STATIC` -- indicates whether static or shared LAPACK and BLAS libraries will be preferred.

Additional platform-specific BLAS/LAPACK notes are listed below.

### Intel Math Kernel Library (MKL)

Intel MKL is a freely-available collection of high-performance libraries that implements BLAS, LAPACK, and ScaLAPACK APIs. MKL is complex: it supports both serial kernels as well as parallel kernels that can take advantage of multiple cores via the use of OpenMP and Intel TBB (the [Intel OneAPI toolkit](https://software.intel.com/oneapi) provides MKL also capable of execution on some Intel GPUs and FPGAs), and the [necessary MKL link options](https://software.intel.com/sites/products/mkl/mkl_link_line_advisor.htm) will depend on the compiler, OS, and other details.

Fortunately, Intel MKL can be discovered by BLAS++/LAPACK++ automatically in most instances; if needed, specifying `BLA_VENDOR` with [appropriate argument](https://cmake.org/cmake/help/latest/module/FindBLAS.html#input-variables) can be used to force TiledArray to use MKL. Unfortunately it is not possible to specify the use of TBB-based backend for MKL without the use of a toolchain file. All MKL-enabled toolchains in [The Valeev Group CMake kit](https://github.com/ValeevGroup/kit-cmake/tree/master/toolchains) can be used to configure TiledArray to use sequential, OpenMP, or TBB backend by setting the `MKL_THREADING` CMake cache variable to `SEQ`, `OMP`, or `TBB`, respectively. The toolchains also respect the user-provided choice of `BLA_STATIC`. If multiple MKL versions are present on your system, specify the appropriate variant of the library by loading the corresponding `mklvars.sh` script to set environment variables `MKLROOT` and, if necessary, `LD_LIBRARY_PATH`/`DYLD_LIBRARY_PATH`.
To discover and configure the use of Intel MKL consider these suggestions:
- The use of NWChemEx discovery kit is strongly recommended for discovering Intel MKL. The following CMake cache variables can be used to specify the desired Intel MKL configuration:
- `intelmkl_PREFERS_STATIC`: whether to look for static or shared/dynamic libraries (default = `OFF`)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be covered by BLAS_PREFERS_STATIC

- `intelmkl_PREFERED_THREAD_LEVEL`: which threading backend to use, supported values are `sequential`, `openmp`, and `tbb` (default = `openmp`)
- `intelmkl_PREFERED_THREAD_LIBRARY`: which thread library to use, supported values are `intel`, `gnu`, and `pgi` (default depends on the compiler)

Also note that even if OpenMP or TBB backends are used, TiledArray will by default set the number of threads to be used by MKL kernels to 1, regardless of the value of environment variables `MKL_NUM_THREADS`/`OMP_NUM_THREADS`. It is possible to change the number of threads to be used programmatically in your application by calling MKL function `mkl_set_num_threads()`.
- Most common configurations of Intel MKL can also be discovered by BLAS++/LAPACK++ automatically; if needed, specifying `BLA_VENDOR` with [appropriate argument](https://cmake.org/cmake/help/latest/module/FindBLAS.html#input-variables) can be used to force TiledArray to use MKL. Unfortunately it is not possible to specify the use of TBB-based backend for MKL without the use of a toolchain file.

- All MKL-enabled toolchains in [The Valeev Group CMake kit](https://github.com/ValeevGroup/kit-cmake/tree/master/toolchains) can be used to configure TiledArray to use sequential, OpenMP, or TBB backend by setting the `MKL_THREADING` CMake cache variable to `SEQ`, `OMP`, or `TBB`, respectively. The toolchains also respect the user-provided choice of `BLA_STATIC`. If multiple MKL versions are present on your system, specify the appropriate variant of the library by loading the corresponding `mklvars.sh` script to set environment variables `MKLROOT` and, if necessary, `LD_LIBRARY_PATH`/`DYLD_LIBRARY_PATH`.

On 64-bit platforms it is possible to specify whether to use 32-bit (`LP64`, the default) or 64-bit (`ILP64`) integers in BLAS/LAPACK API. To choose the `ILP64` interface when using the VG MKL toolchains set CMake cache variable `INTEGER4` to `OFF`; the same is achieved when using the default BLAS/LAPACK detection by setting `BLA_VENDOR` to [one of the valid `Intel*64ilp*` choices](https://cmake.org/cmake/help/latest/module/FindBLAS.html#input-variables). N.B. Currently `ILP64` variant of BLACS/ScaLAPACK is not supported, due to [a pending issue](https://github.com/wavefunction91/blacspp/issues/5).

On 64-bit platforms it is possible to specify whether to use 32-bit (`LP64`, the default) or 64-bit (`ILP64`) integers in BLAS/LAPACK API. To choose the `ILP64` interface when using the VG MKL toolchains set CMake cache variable `INTEGER4` to `OFF`; the same is achieved when using the default BLAS/LAPACK detection by setting `BLA_VENDOR` to [one of the valid `Intel*64ilp*` choices](https://cmake.org/cmake/help/latest/module/FindBLAS.html#input-variables). N.B. Currently `ILP64` variant of BLACS/ScaLAPACK is not supported, due to [a pending issue](https://github.com/wavefunction91/blacspp/issues/5).
Also note that even if OpenMP or TBB backends are used, TiledArray will by default set the number of threads to be used by MKL kernels to 1, regardless of the value of environment variables `MKL_NUM_THREADS`/`OMP_NUM_THREADS`. It is possible to change the number of threads to be used programmatically in your application by calling MKL function `mkl_set_num_threads()`.

## CUDA

Expand Down Expand Up @@ -304,11 +354,12 @@ the correct revision of MADNESS.
The following CMake options may be used to modify build behavior or find MADNESS:

* `ENABLE_MPI` -- Enable MPI [Default=ON]
* `ENABLE_SCALAPACK` -- Enable use of ScaLAPACK bindings [Default=OFF]
* `ENABLE_SCALAPACK` -- Enable the use of ScaLAPACK bindings [Default=OFF]
* `ENABLE_TBB` -- Enable the use of TBB when building MADNESS [Default=ON]
* `ENABLE_GPERFTOOLS` -- Enable the use of gperftools when building MADNESS [Default=OFF]
* `ENABLE_TCMALLOC_MINIMAL` -- Enable the use of gperftool's tcmalloc_minimal library only (the rest of gperftools is skipped) when building MADNESS [Default=OFF]
* `ENABLE_LIBUNWIND` -- Force the discovery of libunwind library when building MADNESS [Default=OFF]
* `ENABLE_WFN91_LINALG_DISCOVERY_KIT` -- Enable the use of NWChemEx's linear algebra discovery [Default=ON]
* `MADNESS_SOURCE_DIR` -- Path to the MADNESS source directory
* `MADNESS_BINARY_DIR` -- Path to the MADNESS build directory
* `MADNESS_URL` -- Path to the MADNESS repository [Default=MADNESS git repository]
Expand Down
15 changes: 7 additions & 8 deletions ci/.build-project
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ while [ $# -gt 0 ]; do
-*) echo "Invalid option $arg"; exit 1 ;;
CXX=*) eval "export $1" ;;
*=*) vars+="\"-D$1\" "
# NB unset vars from s.t. CMake doesn't see env vars, eg BLA_VENDOR
# NB unset these vars in the shell environment s.t. CMake doesn't see them as env vars, e.g. BLA_VENDOR
unset $(echo "$1" | cut -d= -f1)
;;
*) targets+="$1 ";;
Expand All @@ -33,8 +33,8 @@ xtime="/usr/bin/time"
if [ -n "$metrics" ]; then
#sudo apt install time
if [ ! -x $xtime ]; then
echo >&2 "${xtime} is not executable, metrics disabled"
metrics=""
echo >&2 "${xtime} is not executable, metrics disabled"
metrics=""
fi
rm -f "${metrics}"
fi
Expand All @@ -44,9 +44,9 @@ function time_cmd {
cmd="$@"
echo "+ $cmd"
if [ -n "$metrics" ]; then
format="'${step}.memory %Mk\n${step}.time %E\n'"
cmd="${xtime} -f ${format} -a -o $metrics $cmd"
#echo "$cmd"
format="'${step}.memory %Mk\n${step}.time %E\n'"
cmd="${xtime} -f ${format} -a -o $metrics $cmd"
#echo "$cmd"
fi
eval "$cmd"
}
Expand All @@ -73,7 +73,7 @@ section_end host_system_info
section_start "preparing_system_section[collapsed=true]" "Preparing system"
cmd "source ci/openmpi.env"
cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile"
if [[ "$vars" =~ \"-DBLA_VENDOR=Intel ]]; then
if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then
cmd "make -C /home/ValeevGroup install/intel-mkl"
cmd "source /opt/intel/mkl/bin/mklvars.sh intel64"
cmd "echo MKLROOT=\$MKLROOT"
Expand All @@ -94,4 +94,3 @@ for target in ${targets}; do
time_cmd ${target} "cmake --build ${build_dir} --target ${target}"
section_end build_${target}_section
done

Loading