Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kmp_* wrapper for gomp environment #79

Merged
merged 11 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion lib/gc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ endif()
include(functions)

add_subdirectory(Dialect)
add_subdirectory(Transforms)
add_subdirectory(Transforms)
add_subdirectory(ExecutionEngine)
1 change: 1 addition & 0 deletions lib/gc/ExecutionEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_subdirectory(CPURuntime)
15 changes: 15 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
find_package(OpenMP REQUIRED)

if ("iomp" IN_LIST OpenMP_C_LIB_NAMES OR "omp" IN_LIST OpenMP_C_LIB_NAMES OR "omp5" IN_LIST OpenMP_C_LIB_NAMES)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this match "libiomp", "libiomp5", "libomp", "libomp5"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it should. :)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it should. :)

If so, then the "omp" IN_LIST OpenMP_C_LIB_NAMES will also match libgomp? And this is not what we want?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then the "omp" IN_LIST OpenMP_C_LIB_NAMES will also match libgomp

No, I suppose. libgomp has the name gomp in OpenMP_C_LIB_NAMES. IN_LIST of cmake performs a full-name search, instead of finding substr.

"libiomp", "libiomp5", "libomp", "libomp5"

It should only match iomp omp omp5

else()
add_definitions("-DGC_NEEDS_OMP_WRAPPER=1")
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_mlir_library(GCCpuRuntime
SHARED
Parallel.cpp

EXCLUDE_FROM_LIBMLIR
)
188 changes: 188 additions & 0 deletions lib/gc/ExecutionEngine/CPURuntime/Parallel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
//===-- Parallel.cpp - parallel ---------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <assert.h>
#include <atomic>
#include <chrono>
#include <immintrin.h>
#include <omp.h>
#include <stdarg.h>

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define WEAK_SYMBOL __attribute__((weak))

namespace {
struct barrier_t {
alignas(64) std::atomic<int32_t> pending_;
std::atomic<int32_t> rounds_;
uint64_t total_;
// pad barrier to size of cacheline to avoid false sharing
char padding_[64 - 4 * sizeof(int32_t)];
};

using barrier_idle_func = uint64_t (*)(std::atomic<int32_t> *remaining,
int32_t expected_remain, int32_t tid,
void *args);
} // namespace

extern "C" {
int gc_runtime_keep_alive = 0;
void gc_arrive_at_barrier(barrier_t *b, barrier_idle_func idle_func,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the usage scenario for the barrier?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is not currently used for now. But we will introduce cpu barrier ops in CPURuntime in near future.

void *idle_args) {
auto cur_round = b->rounds_.load(std::memory_order_acquire);
auto cnt = --b->pending_;
assert(cnt >= 0);
if (cnt == 0) {
b->pending_.store(b->total_);
b->rounds_.store(cur_round + 1);
} else {
if (idle_func) {
if (cur_round != b->rounds_.load()) {
return;
}
idle_func(&b->rounds_, cur_round + 1, -1, idle_args);
ciyongch marked this conversation as resolved.
Show resolved Hide resolved
}
while (cur_round == b->rounds_.load()) {
_mm_pause();
}
}
}

static_assert(sizeof(barrier_t) == 64, "size of barrier_t should be 64-byte");

void gc_init_barrier(barrier_t *b, int num_barriers, uint64_t thread_count) {
for (int i = 0; i < num_barriers; i++) {
b[i].total_ = thread_count;
b[i].pending_.store(thread_count);
b[i].rounds_.store(0);
}
}

#if GC_NEEDS_OMP_WRAPPER
void WEAK_SYMBOL __kmpc_barrier(void *loc, int32_t global_tid) {
#pragma omp barrier
}

int WEAK_SYMBOL __kmpc_global_thread_num(void *loc) {
return omp_get_thread_num();
}

// The implementation was extracted and simplified from LLVM libomp
// at openmp/runtime/src/kmp_sched.cpp
void WEAK_SYMBOL __kmpc_for_static_init_8u(void *loc, int32_t gtid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
if (unlikely(schedtype != 34)) {
std::abort();
}
const int32_t FALSE = 0;
const int32_t TRUE = 1;
using UT = uint64_t;
// using ST = int64_t;
/* this all has to be changed back to TID and such.. */
uint32_t tid = gtid;
uint32_t nth = omp_get_num_threads();
UT trip_count;

/* special handling for zero-trip loops */
if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
if (plastiter != nullptr)
*plastiter = FALSE;
/* leave pupper and plower set to entire iteration space */
*pstride = incr; /* value should never be used */
return;
}

if (nth == 1) {
if (plastiter != nullptr)
*plastiter = TRUE;
*pstride =
(incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
return;
}

/* compute trip count */
if (incr == 1) {
trip_count = *pupper - *plower + 1;
} else if (incr == -1) {
trip_count = *plower - *pupper + 1;
} else if (incr > 0) {
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
Comment on lines +114 to +122

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems we can combine all these into a single statement trip_count = (UT)(*pupper - *plower) / incr + 1;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is copied and simplified from libomp of LLVM. I would like to keep the original code if possible.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, then it would be better to add a reference link here?

}
if (trip_count < nth) {
if (tid < trip_count) {
*pupper = *plower = *plower + tid * incr;
} else {
// set bounds so non-active threads execute no iterations
*plower = *pupper + (incr > 0 ? 1 : -1);
}
if (plastiter != nullptr)
*plastiter = (tid == trip_count - 1);
} else {
UT small_chunk = trip_count / nth;
UT extras = trip_count % nth;
*plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
*pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
if (plastiter != nullptr)
*plastiter = (tid == nth - 1);
}
*pstride = trip_count;
}

void WEAK_SYMBOL __kmpc_for_static_fini(void *ptr, int32_t v) {}

static thread_local int next_num_threads = 0;

/*!
@ingroup PARALLEL
The type for a microtask which gets passed to @ref __kmpc_fork_call().
The arguments to the outlined function are
@param global_tid the global thread identity of the thread executing the
function.
@param bound_tid the local identity of the thread executing the function
@param ... pointers to shared variables accessed by the function.
*/
using kmpc_micro = void (*)(int32_t *global_tid, int32_t *bound_tid, ...);
void WEAK_SYMBOL __kmpc_fork_call(void *loc, int32_t argc, void *pfunc, ...) {
if (unlikely(argc != 1 && argc != 0)) {
std::abort();
}
va_list ap;
va_start(ap, pfunc);
void *c = va_arg(ap, void *);
int32_t global_tid = 0;
if (unlikely(next_num_threads)) {
#pragma omp parallel num_threads(next_num_threads)
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
next_num_threads = 0;
} else {
#pragma omp parallel
{
kmpc_micro func = (kmpc_micro)(pfunc);
func(&global_tid, nullptr, c);
}
}
va_end(ap);
}

void WEAK_SYMBOL __kmpc_push_num_threads(void *loc, int32_t global_tid,
int32_t num_threads) {
next_num_threads = num_threads;
}
#endif
}
20 changes: 19 additions & 1 deletion src/gc-cpu-runner/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
################################################################################
# Copyright (C) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0
################################################################################

if(GC_DEV_LINK_LLVM_DYLIB)
set(LLVM_LINK_COMPONENTS
LLVM
Expand Down Expand Up @@ -36,7 +53,8 @@ endif()

#LLVM_LINK_COMPONENTS is processed by LLVM cmake in add_llvm_executable
set(gc_cpu_runner_libs
${MLIR_LINK_COMPONENTS})
${MLIR_LINK_COMPONENTS}
GCCpuRuntime)
add_mlir_tool(gc-cpu-runner
gc-cpu-runner.cpp
)
Expand Down
4 changes: 4 additions & 0 deletions src/gc-cpu-runner/gc-cpu-runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
#include "llvm/Support/TargetSelect.h"
#include <stdio.h>

extern int gc_runtime_keep_alive;

int main(int argc, char **argv) {
// keeps GCCPURuntime linked
gc_runtime_keep_alive = 0;
llvm::InitLLVM y(argc, argv);
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
Expand Down
37 changes: 37 additions & 0 deletions test/gc/cpu-runner/tid.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-openmp-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s
module {
func.func private @omp_get_thread_num() -> i32

func.func @check_parallel() {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%0 = llvm.mlir.constant(1 : i64) : i64
omp.parallel num_threads(%c8: index) {
omp.wsloop {
omp.loop_nest (%arg1, %arg2) : index = (%c0, %c0) to (%c1, %c64) step (%c1, %c1) {
cpuruntime.printf "ITR %zu\n" %arg2 : index
omp.yield
}
omp.terminator
}
%tid = func.call @omp_get_thread_num() : () -> i32
cpuruntime.printf "EXIT %d\n" %tid : i32
omp.terminator
}
return
}

func.func @main() {
%0 = func.call @omp_get_thread_num() : () -> i32
cpuruntime.printf "TID %d\n" %0 : i32
call @check_parallel() : ()->()
return
}
// CHECK: TID 0
// CHECK-COUNT-64: ITR {{[0-9]+}}
// CHECK-NOT: ITR
// CHECK-COUNT-8: EXIT {{[0-9]+}}
// CHECK-NOT: EXIT
}