From 4ceef429cf4103641c228ed2549e88a60c9b2567 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 14 Nov 2024 03:13:52 +0800
Subject: [PATCH] [CIR] [Lowering] [X86_64] Support VAArg for LongDouble (#1101)

This is a follow-up to https://github.com/llvm/clangir/pull/1100. After
that patch, using LongDouble with VAArg still runs into trouble due to
details of the X86_64 ABI, and this patch tries to address that.

The practical impact of the patch: together with
https://github.com/llvm/clangir/pull/1088 and a small follow-up fix, we
can build and run all of the C benchmarks in SPEC CPU 2017. I think it
is a milestone.
---
 clang/lib/CIR/Dialect/IR/CIRTypes.cpp         |   3 +-
 .../Transforms/TargetLowering/ABIInfoImpl.cpp |   7 ++
 .../Transforms/TargetLowering/ABIInfoImpl.h   |   1 +
 .../TargetLowering/CIRLowerContext.cpp        |  12 ++
 .../Targets/LoweringPrepareX86CXXABI.cpp      |   5 +-
 .../Transforms/TargetLowering/Targets/X86.cpp | 116 +++++++++++++++++-
 .../TargetLowering/Targets/X86_64ABIInfo.h    |   8 ++
 clang/test/CIR/Lowering/var-arg-x86_64.c      |  53 ++++++++
 8 files changed, 197 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index e579fe4c2f0c..21584eeaefb9 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -747,7 +747,7 @@ const llvm::fltSemantics &FP80Type::getFloatSemantics() const {
 llvm::TypeSize
 FP80Type::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
                             mlir::DataLayoutEntryListRef params) const {
-  return llvm::TypeSize::getFixed(16);
+  return llvm::TypeSize::getFixed(128);
 }
 
 uint64_t FP80Type::getABIAlignment(const mlir::DataLayout &dataLayout,
@@ -768,6 +768,7 @@ const llvm::fltSemantics &FP128Type::getFloatSemantics() const {
 llvm::TypeSize
 FP128Type::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
                              mlir::DataLayoutEntryListRef params) const {
+  // FIXME: We probably want it to return 128. But we're lacking a test now.
   return llvm::TypeSize::getFixed(16);
 }
 
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
index e07315d54a38..15b54b526aeb 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
@@ -65,4 +65,11 @@ CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT,
   return CXXABI.getRecordArgABI(RT);
 }
 
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI) {
+  auto sTy = mlir::dyn_cast<StructType>(ty);
+  if (!sTy)
+    return CIRCXXABI::RAA_Default;
+  return getRecordArgABI(sTy, CXXABI);
+}
+
 } // namespace cir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
index 8005b153a544..8088a333c4a5 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
@@ -33,6 +33,7 @@ mlir::Value emitRoundPointerUpToAlignment(cir::CIRBaseBuilderTy &builder,
 mlir::Type useFirstFieldIfTransparentUnion(mlir::Type Ty);
 
 CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT,
                                         CIRCXXABI &CXXABI);
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI);
 
 } // namespace cir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
index c6960d411b93..09013f113015 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
@@ -94,6 +94,18 @@ clang::TypeInfo CIRLowerContext::getTypeInfoImpl(const mlir::Type T) const {
       Align = Target->getDoubleAlign();
       break;
     }
+    if (auto longDoubleTy = mlir::dyn_cast<LongDoubleType>(T)) {
+      if (getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice &&
+          (Target->getLongDoubleWidth() != AuxTarget->getLongDoubleWidth() ||
+           Target->getLongDoubleAlign() != AuxTarget->getLongDoubleAlign())) {
+        Width = AuxTarget->getLongDoubleWidth();
+        Align = AuxTarget->getLongDoubleAlign();
+      } else {
+        Width = Target->getLongDoubleWidth();
+        Align = Target->getLongDoubleAlign();
+      }
+      break;
+    }
     cir_cconv_unreachable("Unknown builtin type!");
     break;
   }
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
index ba376d26b0fc..d9b574fac4e7 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
@@ -46,7 +46,6 @@ std::unique_ptr<LowerModule> getLowerModule(cir::VAArgOp op) {
   mlir::ModuleOp mo = op->getParentOfType<mlir::ModuleOp>();
   if (!mo)
     return nullptr;
-
   mlir::PatternRewriter rewriter(mo.getContext());
   return cir::createLowerModule(mo, rewriter);
 }
@@ -92,7 +91,7 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   // Let's hope LLVM's va_arg instruction can take care of it.
   // Remove this when X86_64ABIInfo::classify can take care of every type.
   if (!mlir::isa<IntType, SingleType, DoubleType, BoolType,
-                 StructType>(op.getType()))
+                 StructType, LongDoubleType>(op.getType()))
     return nullptr;
 
   // Assume that va_list type is correct; should be pointer to LLVM type:
@@ -107,7 +106,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   std::unique_ptr<LowerModule> lowerModule = getLowerModule(op);
   if (!lowerModule)
     return nullptr;
-
   mlir::Type ty = op.getType();
 
   // FIXME: How should we access the X86AVXABILevel?
@@ -167,7 +165,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   mlir::Block *contBlock = currentBlock->splitBlock(op);
   mlir::Block *inRegBlock = builder.createBlock(contBlock);
   mlir::Block *inMemBlock = builder.createBlock(contBlock);
-
   builder.setInsertionPointToEnd(currentBlock);
   builder.create<cir::BrCondOp>(loc, inRegs, inRegBlock, inMemBlock);
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
index 39bd1716aa3b..fc65d4053649 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
@@ -165,6 +165,21 @@ void X86_64ABIInfo::classify(mlir::Type Ty, uint64_t OffsetBase, Class &Lo,
     Current = Class::SSE;
     return;
 
+  } else if (mlir::isa<LongDoubleType>(Ty)) {
+    const llvm::fltSemantics *LDF =
+        &getContext().getTargetInfo().getLongDoubleFormat();
+    if (LDF == &llvm::APFloat::IEEEquad()) {
+      Lo = Class::SSE;
+      Hi = Class::SSEUp;
+    } else if (LDF == &llvm::APFloat::x87DoubleExtended()) {
+      Lo = Class::X87;
+      Hi = Class::X87Up;
+    } else if (LDF == &llvm::APFloat::IEEEdouble()) {
+      Current = Class::SSE;
+    } else {
+      llvm_unreachable("unexpected long double representation!");
+    }
+    return;
   } else if (mlir::isa<BoolType>(Ty)) {
     Current = Class::Integer;
   } else if (const auto RT = mlir::dyn_cast<StructType>(Ty)) {
@@ -267,6 +282,65 @@ void X86_64ABIInfo::classify(mlir::Type Ty, uint64_t OffsetBase, Class &Lo,
   cir_cconv_unreachable("NYI");
 }
 
+ABIArgInfo X86_64ABIInfo::getIndirectResult(mlir::Type ty,
+                                            unsigned freeIntRegs) const {
+  // If this is a scalar LLVM value then assume LLVM will pass it in the right
+  // place naturally.
+  //
+  // This assumption is optimistic, as there could be free registers available
+  // when we need to pass this argument in memory, and LLVM could try to pass
+  // the argument in the free register. This does not seem to happen currently,
+  // but this code would be much safer if we could mark the argument with
+  // 'onstack'. See PR12193.
+  if (!isAggregateTypeForABI(ty) /* && IsIllegalVectorType(Ty) &&*/
+      /*!Ty->isBitIntType()*/) {
+    // FIXME: Handling enum type?
+
+    return (isPromotableIntegerTypeForABI(ty) ? ABIArgInfo::getExtend(ty)
+                                              : ABIArgInfo::getDirect());
+  }
+
+  if (CIRCXXABI::RecordArgABI RAA = getRecordArgABI(ty, getCXXABI()))
+    return getNaturalAlignIndirect(ty, RAA == CIRCXXABI::RAA_DirectInMemory);
+
+  // Compute the byval alignment. We specify the alignment of the byval in all
+  // cases so that the mid-level optimizer knows the alignment of the byval.
+  unsigned align = std::max(getContext().getTypeAlign(ty) / 8, 8U);
+
+  // Attempt to avoid passing indirect results using byval when possible. This
+  // is important for good codegen.
+  //
+  // We do this by coercing the value into a scalar type which the backend can
+  // handle naturally (i.e., without using byval).
+  //
+  // For simplicity, we currently only do this when we have exhausted all of the
+  // free integer registers. Doing this when there are free integer registers
+  // would require more care, as we would have to ensure that the coerced value
+  // did not claim the unused register. That would require either reordering the
+  // arguments to the function (so that any subsequent inreg values came first),
+  // or only doing this optimization when there were no following arguments that
+  // might be inreg.
+  //
+  // We currently expect it to be rare (particularly in well written code) for
+  // arguments to be passed on the stack when there are still free integer
+  // registers available (this would typically imply large structs being passed
+  // by value), so this seems like a fair tradeoff for now.
+  //
+  // We can revisit this if the backend grows support for 'onstack' parameter
+  // attributes. See PR12193.
+  if (freeIntRegs == 0) {
+    uint64_t size = getContext().getTypeSize(ty);
+
+    // If this type fits in an eightbyte, coerce it into the matching integral
+    // type, which will end up on the stack (with alignment 8).
+    if (align == 8 && size <= 64)
+      return ABIArgInfo::getDirect(
+          cir::IntType::get(LT.getMLIRContext(), size, false));
+  }
+
+  return ABIArgInfo::getIndirect(align);
+}
+
 /// Return a type that will be passed by the backend in the low 8 bytes of an
 /// XMM register, corresponding to the SSE class.
 mlir::Type X86_64ABIInfo::GetSSETypeAtOffset(mlir::Type IRType,
@@ -278,7 +352,7 @@ mlir::Type X86_64ABIInfo::GetSSETypeAtOffset(mlir::Type IRType,
       (unsigned)getContext().getTypeSize(SourceTy) / 8 - SourceOffset;
   mlir::Type T0 = getFPTypeAtOffset(IRType, IROffset, TD);
   if (!T0 || mlir::isa<cir::FP80Type>(T0))
-    return T0; // NOTE(cir): Not sure if this is correct.
+    return cir::DoubleType::get(LT.getMLIRContext());
 
   mlir::Type T1 = {};
   unsigned T0Size = TD.getTypeAllocSize(T0);
@@ -539,6 +613,22 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
     ++neededSSE;
     break;
   }
+
+  // AMD64-ABI 3.2.3p3: Rule 1. If the class is MEMORY, pass the argument
+  // on the stack.
+  case Class::Memory:
+
+  // AMD64-ABI 3.2.3p3: Rule 5. If the class is X87, X87UP or
+  // COMPLEX_X87, it is passed in memory.
+  case Class::X87:
+  case Class::ComplexX87:
+    if (getRecordArgABI(Ty, getCXXABI()) == CIRCXXABI::RAA_Indirect)
+      ++neededInt;
+    return getIndirectResult(Ty, freeIntRegs);
+
+  case Class::SSEUp:
+  case Class::X87Up:
+    llvm_unreachable("Invalid classification for lo word.");
 
   default:
     cir_cconv_assert_or_abort(
         !cir::MissingFeatures::X86ArgTypeClassification(), "NYI");
@@ -546,6 +636,11 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
   }
 
   mlir::Type HighPart = {};
   switch (Hi) {
+  case Class::Memory:
+  case Class::X87:
+  case Class::ComplexX87:
+    llvm_unreachable("Invalid classification for hi word.");
+
   case Class::NoClass:
     break;
@@ -558,8 +653,23 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
       return ABIArgInfo::getDirect(HighPart, 8);
     break;
 
-  default:
-    cir_cconv_unreachable("NYI");
+  // X87Up generally doesn't occur here (long double is passed in
+  // memory), except in situations involving unions.
+  case Class::X87Up:
+  case Class::SSE:
+    ++neededSSE;
+    HighPart = GetSSETypeAtOffset(Ty, 8, Ty, 8);
+
+    if (Lo == Class::NoClass) // Pass HighPart at offset 8 in memory.
+      return ABIArgInfo::getDirect(HighPart, 8);
+    break;
+
+  // AMD64-ABI 3.2.3p3: Rule 4. If the class is SSEUP, the
+  // eightbyte is passed in the upper half of the last used SSE
+  // register. This only happens when 128-bit vectors are passed.
+  case Class::SSEUp:
+    llvm_unreachable("NYI && We need to implement GetByteVectorType");
+    break;
   }
 
   // If a high part was specified, merge it together with the low part. It is
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
index 201730519207..2ac4cdaa0309 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
@@ -69,6 +69,14 @@ class X86_64ABIInfo : public cir::ABIInfo {
                                 mlir::Type SourceTy,
                                 unsigned SourceOffset) const;
 
+  /// Given a source type \arg ty, return a suitable result such that the
+  /// argument will be passed in memory.
+  ///
+  /// \param freeIntRegs - The number of free integer registers remaining
+  /// available.
+  ::cir::ABIArgInfo getIndirectResult(mlir::Type ty,
+                                      unsigned freeIntRegs) const;
+
   /// The 0.98 ABI revision clarified a lot of ambiguities,
   /// unfortunately in ways that were not always consistent with
   /// certain previous compilers. In particular, platforms which
diff --git a/clang/test/CIR/Lowering/var-arg-x86_64.c b/clang/test/CIR/Lowering/var-arg-x86_64.c
index 992d5e82cd98..a445ac657c74 100644
--- a/clang/test/CIR/Lowering/var-arg-x86_64.c
+++ b/clang/test/CIR/Lowering/var-arg-x86_64.c
@@ -1,3 +1,4 @@
+// REQUIRES: system-linux
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -fno-clangir-call-conv-lowering %s -o %t.cir
 // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -fno-clangir-call-conv-lowering %s -o %t.ll
@@ -76,3 +77,55 @@ double f1(int n, ...) {
 // CIR: [[CASTED_ARG_P:%.+]] = cir.cast(bitcast, [[ARG]]
 // CIR: [[CASTED_ARG:%.+]] = cir.load align(16) [[CASTED_ARG_P]]
 // CIR: store [[CASTED_ARG]], [[RES]]
+long double f2(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  long double res = va_arg(valist, long double);
+  va_end(valist);
+  return res;
+}
+
+// CHECK: define {{.*}}@f2
+// CHECK: [[RESULT:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST_ALLOCA:%.+]] = alloca {{.*}}[[VA_LIST_TYPE]]
+// CHECK: [[RES:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_start.p0(ptr [[VA_LIST]])
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: [[OVERFLOW_AREA_P:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 2
+// CHECK: [[OVERFLOW_AREA:%.+]] = load ptr, ptr [[OVERFLOW_AREA_P]]
+// Ptr Mask Operations
+// CHECK: [[OVERFLOW_AREA_OFFSET_ALIGNED:%.+]] = getelementptr i8, ptr [[OVERFLOW_AREA]], i64 15
+// CHECK: [[OVERFLOW_AREA_OFFSET_ALIGNED_P:%.+]] = ptrtoint ptr [[OVERFLOW_AREA_OFFSET_ALIGNED]] to i32
+// CHECK: [[MASKED:%.+]] = and i32 [[OVERFLOW_AREA_OFFSET_ALIGNED_P]], -16
+// CHECK: [[DIFF:%.+]] = sub i32 [[OVERFLOW_AREA_OFFSET_ALIGNED_P]], [[MASKED]]
+// CHECK: [[PTR_MASKED:%.+]] = getelementptr i8, ptr [[OVERFLOW_AREA_OFFSET_ALIGNED]], i32 [[DIFF]]
+// CHECK: [[OVERFLOW_AREA_NEXT:%.+]] = getelementptr i8, ptr [[PTR_MASKED]], i64 16
+// CHECK: store ptr [[OVERFLOW_AREA_NEXT]], ptr [[OVERFLOW_AREA_P]]
+// CHECK: [[VALUE:%.+]] = load x86_fp80, ptr [[PTR_MASKED]]
+// CHECK: store x86_fp80 [[VALUE]], ptr [[RES]]
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST2]])
+// CHECK: [[VALUE2:%.+]] = load x86_fp80, ptr [[RES]]
+// CHECK: store x86_fp80 [[VALUE2]], ptr [[RESULT]]
+// CHECK: [[RETURN_VALUE:%.+]] = load x86_fp80, ptr [[RESULT]]
+// CHECK: ret x86_fp80 [[RETURN_VALUE]]
+
+// CIR: cir.func @f2
+// CIR: [[VA_LIST_ALLOCA:%.+]] = cir.alloca !cir.array<!ty___va_list_tag x 1>, !cir.ptr<!cir.array<!ty___va_list_tag x 1>>, ["valist"]
+// CIR: [[RES:%.+]] = cir.alloca !cir.long_double<!cir.f80>, !cir.ptr<!cir.long_double<!cir.f80>>, ["res"
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: cir.va.start [[VASTED_VA_LIST]]
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: [[OVERFLOW_AREA_P:%.+]] = cir.get_member [[VASTED_VA_LIST]][2] {name = "overflow_arg_area"}
+// CIR: [[OVERFLOW_AREA:%.+]] = cir.load [[OVERFLOW_AREA_P]]
+// CIR: [[CASTED:%.+]] = cir.cast(bitcast, [[OVERFLOW_AREA]] : !cir.ptr<!void>)
+// CIR: [[CONSTANT:%.+]] = cir.const #cir.int<15>
+// CIR: [[PTR_STRIDE:%.+]] = cir.ptr_stride([[CASTED]] {{.*}}[[CONSTANT]]
+// CIR: [[MINUS_ALIGN:%.+]] = cir.const #cir.int<-16>
+// CIR: [[ALIGNED:%.+]] = cir.ptr_mask([[PTR_STRIDE]], [[MINUS_ALIGN]]
+// CIR: [[ALIGN:%.+]] = cir.const #cir.int<16>
+// CIR: [[CAST_ALIGNED:%.+]] = cir.cast(bitcast, [[ALIGNED]] : !cir.ptr<!u8i>), !cir.ptr<!cir.long_double<!cir.f80>>
+// CIR: [[CAST_ALIGNED_VALUE:%.+]] = cir.load [[CAST_ALIGNED]]
+// CIR: cir.store [[CAST_ALIGNED_VALUE]], [[RES]]
+// CIR: cir.va.end
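
For reference, here is a minimal standalone program exercising the path this
patch enables. This is a sketch, not part of the patch: it assumes a
ClangIR-enabled clang whose driver accepts -fclangir (the test above uses the
equivalent cc1 flags), and the helper name sum_ld is purely illustrative.

    #include <stdarg.h>
    #include <stdio.h>

    /* Sums n long double varargs. On x86_64 Linux, long double is the
     * 80-bit x87 format, so classify() above marks it X87/X87Up and the
     * argument is passed in memory (AMD64 ABI 3.2.3p3, Rule 5) -- exactly
     * the va_arg path this patch teaches the lowering to handle. */
    static long double sum_ld(int n, ...) {
      va_list ap;
      va_start(ap, n);
      long double total = 0.0L;
      for (int i = 0; i < n; ++i)
        total += va_arg(ap, long double);
      va_end(ap);
      return total;
    }

    int main(void) {
      printf("%Lf\n", sum_ld(3, 1.0L, 2.0L, 3.5L)); /* prints 6.500000 */
      return 0;
    }

Built with something like `clang -fclangir repro.c` for
x86_64-unknown-linux-gnu, this previously hit the unhandled VAArg lowering
path; with this patch it compiles and runs.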