#!/bin/bash

# hc-kernel-assemble kernel-bitcode kernel-object

# Environment-driven switches. Each keeps a caller-supplied value and falls
# back to "0" when the variable is unset or empty.

# enable bash debugging (command tracing) when set to "1"
: "${KMDBSCRIPT:=0}"

# dump the LLVM bitcode when set to "1"
: "${KMDUMPLLVM:=0}"

# when 1, the kernel is finalized by the backend instead of being embedded
# as IR; can also be switched on by the --early_finalize option
: "${AMDGPU_OBJ_CODEGEN:=0}"

# flag for function calls enabled
AMDGPU_FUNC_CALLS="0"

# inject additional GPU archs through an env var ("0" means none)
: "${HCC_EXTRA_GPU_ARCH:=0}"

# Quoted so an unexpected value (e.g. one containing whitespace) is simply
# treated as "not 1" instead of making the test itself error out.
if [ "$KMDBSCRIPT" == "1" ]; then
  set -x
fi

# Directory this script was invoked from; every helper tool below is looked
# up relative to it.
BINDIR=$(dirname "$0")
CLANG="$BINDIR/hcc"
LLVM_LINK="$BINDIR/llvm-link"
OPT="$BINDIR/opt"
CLAMP_DEVICE="$BINDIR/clamp-device"
LLVM_AS="$BINDIR/llvm-as"
LLVM_DIS="$BINDIR/llvm-dis"
CLAMP_ASM="$BINDIR/clamp-assemble"
LIBPATH="$BINDIR/../lib"
CLANG_OFFLOAD_BUNDLER="$BINDIR/clang-offload-bundler"

# Add error handling functions (presumably where exit_with_code and
# check_exit_status, used further down, are defined).
# The path is quoted so a tool directory containing whitespace can still be
# sourced.
. "$BINDIR/error-check"

# Header search paths. In a build tree the HCC compiler tools live under
# compiler/bin/ with headers under include/; in an install tree the tools
# live under bin/ with headers under include/. Passing both
# -I$BINDIR/../include and -I$BINDIR/../../include lets the headers be found
# in either layout.
CXXFLAGS="-std=c++amp"
CXXFLAGS+=" -I$BINDIR/../include"
CXXFLAGS+=" -I$BINDIR/../../include"
CXXFLAGS+=" -fPIC"

# Append optimisation/debug flags for the configured CMAKE_BUILD_TYPE.
# nocasematch makes the patterns below match regardless of letter case; the
# option is left enabled for the remainder of the script.
shopt -s nocasematch
CMAKE_BUILD_TYPE="Release"
case $CMAKE_BUILD_TYPE in
  release)        CXXFLAGS+=" -O3" ;;
  relwithdebinfo) CXXFLAGS+=" -O2 -g" ;;
  minsizerel)     CXXFLAGS+=" -Os" ;;
  debug)          CXXFLAGS+=" -g" ;;
  *)              : ;;  # unrecognised build type: no extra flags
esac

# Both positional arguments (kernel bitcode, output object) are mandatory.
if [ "$#" -lt 2 ]; then
  echo "Usage: $0 kernel-bitcode object" >&2
  exit_with_code -1
fi

# GPU targets to generate code for: the optional extra arch from the
# environment first, then every --amdgpu-target= option in command-line order.
AMDGPU_TARGET_ARRAY=()
# Quoted: an unquoted value containing whitespace makes `[` fail with
# "too many arguments", so the extra arch would be dropped.
if [ "$HCC_EXTRA_GPU_ARCH" != "0" ]; then
  AMDGPU_TARGET_ARRAY+=("$HCC_EXTRA_GPU_ARCH")
fi


# Scan every argument (the two positional ones included) for the options
# handled here; anything that matches no pattern is ignored.
for ARG in "$@"; do
  case "$ARG" in
    ######################
    # Parse AMDGPU target
    ######################
    --amdgpu-target=*)
      # keep everything after the first '='
      AMDGPU_TARGET_ARRAY+=("${ARG#*=}")
      ;;
    --early_finalize)
      AMDGPU_OBJ_CODEGEN=1
      ;;
    --amdgpu-func-calls)
      AMDGPU_FUNC_CALLS="1"
      ;;
  esac
done

# inject a new GPU IR to replace the IR from the FE
# When HC_REPLACE_GPU_FE_LLVM is set to anything other than "0", its value is
# used as the kernel input file in place of $1.
: "${HC_REPLACE_GPU_FE_LLVM:=0}"
KERNEL_INPUT=$1
# Quoted because the value is a file path: unquoted, a path containing
# whitespace breaks the test and the override is ignored.
if [ "$HC_REPLACE_GPU_FE_LLVM" != "0" ]; then
  KERNEL_INPUT=$HC_REPLACE_GPU_FE_LLVM
  echo "Replacing GPU input IR from clang with $KERNEL_INPUT"
fi

# The chosen input must be an existing regular file.
if [ ! -f "$KERNEL_INPUT" ]; then
  echo "kernel-bitcode $KERNEL_INPUT is not valid" >&2
  exit_with_code -1
fi

# NOTE(review): CO is not referenced in the visible part of this script; kept
# in case something outside this view relies on it.
CO="-c -o"

# Private scratch directory; intermediate files are named after the output
# object. Abort if it cannot be created, otherwise the paths below would be
# rooted at "/".
TEMP_DIR=$(mktemp -d)
check_exit_status mktemp
BASENAME=$(basename "$2")
TEMP_NAME="$TEMP_DIR/$BASENAME"

# ELF section names for IR and ISA
KERNEL_IR_SECTION=".kernel_ir"
KERNEL_SECTION=".kernel"

# The original note here says the kernel-assemble step goes after the
# host-assemble step: if "$2" already exists, set it aside in the scratch
# directory so it can be linked with the kernel object at the end.
if [ -f "$2" ]; then
  mv "$2" "$TEMP_DIR/$BASENAME.tmp.o"
  check_exit_status mv
fi

# Work on a private copy of the kernel bitcode.
cp "$KERNEL_INPUT" "$TEMP_NAME.bc"
check_exit_status cp

# Optional debugging aid: disassemble the input and leave a copy in the
# current directory. Failures here are not checked.
if [ "$KMDUMPLLVM" == "1" ]; then
  "$LLVM_DIS" "$TEMP_NAME.bc" -o "$TEMP_NAME.ll"
  cp "$TEMP_NAME.ll" ./dump.kernel_input.ll
fi

# Build "$TEMP_NAME.camp.o" from the kernel input, either by finalizing it for
# each requested GPU target and bundling the results, or by passing the input
# to clamp-assemble with the IR section name.
if [ "$AMDGPU_OBJ_CODEGEN" == 1 ]; then
  # invoke the backend to finalize the IR

  # touch an empty object for host part, to accommodate rule required by
  # clang-offload-bundler
  touch "$TEMP_DIR/__empty.o"

  # Comma-separated -inputs= / -targets= lists for clang-offload-bundler; the
  # empty host object is always the first entry and one entry per GPU target
  # is appended in the loop below.
  CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS="-inputs=$TEMP_DIR/__empty.o"
  CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS="-targets=host-x86_64-unknown-linux"

  # Options shared by every clamp-device invocation.
  CLAMP_DEVICE_ARGS=(--early-finalize)
  if [ "$AMDGPU_FUNC_CALLS" == "1" ]; then
    CLAMP_DEVICE_ARGS+=(--amdgpu-func-calls)
  fi

  pids=()
  # for each GPU target, lower to GCN ISA in HSACO format; one background job
  # per target.
  # The array is deliberately expanded unquoted so that an element holding
  # several whitespace-separated targets still yields one iteration per target.
  # shellcheck disable=SC2068
  for AMDGPU_TARGET in ${AMDGPU_TARGET_ARRAY[@]}; do
    "$CLAMP_DEVICE" "$KERNEL_INPUT" "$TEMP_DIR/kernel-$AMDGPU_TARGET.hsaco" \
      "--amdgpu-target=$AMDGPU_TARGET" "${CLAMP_DEVICE_ARGS[@]}" &

    # remember the job's PID exactly once (re-appending the whole list here
    # would fill the array with duplicates)
    pids+=("$!")

    # augment arguments to clang-offload-bundler
    CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS+=",$TEMP_DIR/kernel-$AMDGPU_TARGET.hsaco"
    CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS+=",hcc-amdgcn-amd-amdhsa--$AMDGPU_TARGET"
  done

  # collect all the error codes from forked processes; any non-zero exit
  # status aborts the script
  ret=0
  for pid in "${pids[@]}"; do
    wait "$pid" || ret=$((ret+$?))
  done
  if [ "$ret" != 0 ]; then
    exit_with_code -1
  fi

  # invoke clang-offload-bundler; each list is passed as a single word so a
  # scratch directory containing whitespace does not split it
  "$CLANG_OFFLOAD_BUNDLER" -type=o "$CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS" "$CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS" "-outputs=$TEMP_DIR/kernel.bundle"
  check_exit_status "$CLANG_OFFLOAD_BUNDLER"

  # clamp-assemble is run from inside the scratch directory so it is handed
  # the bare file name "kernel.bundle".
  # NOTE(review): BINDIR comes from dirname "$0"; if this script is invoked
  # through a relative path, $CLAMP_ASM will not resolve after the cd below.
  # Confirm callers use an absolute path.
  OLD_PWD=$PWD
  cd "$TEMP_DIR"
  check_exit_status cd
  "$CLAMP_ASM" "kernel.bundle" "$TEMP_NAME.camp.o" "$KERNEL_SECTION"
  check_exit_status "$CLAMP_ASM"
  cd "$OLD_PWD"
else
  # No finalization: expose the kernel input under a ".bc" name next to $1 and
  # hand it to clamp-assemble together with the IR section name.
  ln -s "$KERNEL_INPUT" "$1.bc"
  "$CLAMP_ASM" "$1.bc" "$TEMP_NAME.camp.o" "$KERNEL_IR_SECTION"
  check_exit_status "$CLAMP_ASM"
fi

# Produce the final object at "$2". If no host object was set aside in the
# scratch directory, the kernel object alone becomes the output; otherwise
# the two are combined with a relocatable link.
if [ ! -f "$TEMP_NAME.tmp.o" ]; then
  mv "$TEMP_NAME.camp.o" "$2"
  check_exit_status mv
else
  ld -r --allow-multiple-definition "$TEMP_NAME.tmp.o" "$TEMP_NAME.camp.o" -o "$2"
  check_exit_status ld
fi

# Remove the scratch directory and everything staged in it.
rm -rf "$TEMP_DIR"
