Skip to content

[Clang] Add warnings when mixing different charN_t types #138708

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,10 @@ Improvements to Clang's diagnostics
- ``-Wreserved-identifier`` now fires on reserved parameter names in a function
declaration which is not a definition.

- A new ``-Wcharacter-conversion`` warning diagnoses comparisons and implicit conversions
between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
This warning only triggers in C++, as these types are aliases in C. (#GH138526)

Improvements to Clang's time-trace
----------------------------------

Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/AST/ASTDiagnostic.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ namespace clang {
/// is initialized before passing it in.
QualType desugarForDiagnostic(ASTContext &Context, QualType QT,
bool &ShouldAKA);

/// Render the UTF code unit \p Value as text suitable for a diagnostic.
///
/// \p T must be one of the Unicode character types (char8_t, char16_t or
/// char32_t); it selects the encoding used to interpret \p Value. When the
/// code unit is a complete code point on its own, the result is that code
/// point encoded as UTF-8 and passed through EscapeStringForDiagnostic.
/// Otherwise the result is the hexadecimal value of the code unit wrapped in
/// angle brackets.
std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T);

} // end namespace clang

#endif
1 change: 1 addition & 0 deletions clang/include/clang/AST/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -2524,6 +2524,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
bool isChar16Type() const;
bool isChar32Type() const;
bool isAnyCharacterType() const;
bool isUnicodeCharacterType() const;
bool isIntegralType(const ASTContext &Ctx) const;

/// Determine whether this type is an integral or enumeration type.
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticGroups.td
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
ImplicitEnumEnumCast,
EnumFloatConversion,
EnumCompareConditional]>;
// Warnings about mixing different Unicode character types; controlled by
// -Wcharacter-conversion and also listed as a member of the "conversion"
// group.
def CharacterConversion : DiagGroup<"character-conversion">;
def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
def ObjCSignedCharBoolImplicitIntConversion :
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
Expand Down Expand Up @@ -1073,6 +1074,7 @@ def Parentheses : DiagGroup<"parentheses",
// - __null-to-integer conversion warnings are on by default
def Conversion : DiagGroup<"conversion",
[BoolConversion,
CharacterConversion,
ConstantConversion,
EnumConversion,
BitFieldEnumConversion,
Expand Down
30 changes: 29 additions & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -4357,6 +4357,29 @@ def warn_address_of_reference_bool_conversion : Warning<
"code; pointer may be assumed to always convert to true">,
InGroup<UndefinedBoolConversion>;

// Diagnostics of the -Wcharacter-conversion group.

// Implicit conversion between Unicode character types whose operand could not
// be evaluated as an integer constant and whose source type is not wider than
// the destination. Per the message text, %0 is the source type and %1 the
// destination type.
def warn_impcast_unicode_char_type
: Warning<"implicit conversion from %0 to %1 may change the meaning of the "
"represented code unit">,
InGroup<CharacterConversion>;
// Same as above, but chosen when the source type is wider than the
// destination type.
def warn_impcast_unicode_precision
: Warning<"implicit conversion from %0 to %1 may lose precision and change "
"the meaning of the represented code unit">,
InGroup<CharacterConversion>;
// Implicit conversion of a known constant value. %2 selects "code unit" (0)
// or "code point" (1); %3 is the value already formatted for display.
def warn_impcast_unicode_char_type_constant
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
"%select{code unit|code point}2 '%3'">,
InGroup<CharacterConversion>;

// Comparison between different Unicode character types where at least one
// operand is not a known constant. %0 and %1 are the operand types.
def warn_comparison_unicode_mixed_types
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"may compare different code points">,
InGroup<CharacterConversion>;

// Comparison between different Unicode character types where both operands
// are known constants. %2 and %3 are the formatted left and right values.
def warn_comparison_unicode_mixed_types_constant
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"compares unrelated code units '%2' and '%3'">,
InGroup<CharacterConversion>;

def warn_xor_used_as_pow : Warning<
"result of '%0' is %1; did you mean exponentiation?">,
InGroup<XorUsedAsPow>;
Expand Down Expand Up @@ -6820,7 +6843,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <

def note_counted_by_consider_completing_pointee_ty : Note<
"consider providing a complete definition for %0">;

def note_counted_by_consider_using_sized_by : Note<
"consider using '__sized_by%select{|_or_null}0' instead of "
"'__counted_by%select{|_or_null}0'">;
Expand Down Expand Up @@ -7719,6 +7742,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
"%diff{ ($ and $)|}0,1">,
InGroup<EnumCompareSwitch>;

// Non-comparison arithmetic conversion between different Unicode character
// types. %0 is the arithmetic conversion kind, rendered through the
// select_arith_conv_kind substitution (defined outside this excerpt); %1 and
// %2 are the two operand types.
def warn_arith_conv_mixed_unicode_types
: Warning<"%sub{select_arith_conv_kind}0 "
"different Unicode character types %1 and %2">,
InGroup<CharacterConversion>;

def err_typecheck_assign_const : Error<
"%select{"
"cannot assign to return value because function %1 returns a const value|"
Expand Down
30 changes: 30 additions & 0 deletions clang/lib/AST/ASTDiagnostic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "clang/AST/TemplateBase.h"
#include "clang/AST/Type.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;
Expand Down Expand Up @@ -2190,3 +2192,31 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType,
TD.DiffTemplate();
return TD.Emit();
}

std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) {
if (T->isChar8Type()) {
assert(Value <= 0xFF && "not a valid UTF-8 code unit");
return Value <= 0x7F;
}
if (T->isChar16Type()) {
assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
}
assert(T->isChar32Type());
return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
};
llvm::SmallVector<char, 16> Str;
if (!IsSingleCodeUnitCP(Value, T)) {
llvm::raw_svector_ostream OS(Str);
OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
return std::string(Str.begin(), Str.end());
}

char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
char *Ptr = Buffer;
[[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr);
assert(Converted && "trying to encode invalid code unit");
EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str);
return std::string(Str.begin(), Str.end());
}
14 changes: 14 additions & 0 deletions clang/lib/AST/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const {
}
}

bool Type::isUnicodeCharacterType() const {
const auto *BT = dyn_cast<BuiltinType>(CanonicalType);
if (!BT)
return false;
switch (BT->getKind()) {
default:
return false;
case BuiltinType::Char8:
case BuiltinType::Char16:
case BuiltinType::Char32:
return true;
}
}

/// isSignedIntegerType - Return true if this is an integer type that is
/// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..],
/// an enum decl which has a signed representation
Expand Down
47 changes: 47 additions & 0 deletions clang/lib/Sema/SemaChecking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "CheckExprLifetime.h"
#include "clang/AST/APValue.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/Attr.h"
#include "clang/AST/AttrIterator.h"
#include "clang/AST/CharUnits.h"
Expand Down Expand Up @@ -11810,6 +11811,47 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) {
}
}

/// Warn about an implicit conversion of \p E to \p T when both the source and
/// the destination are (distinct) Unicode character types.
///
/// \param S      Semantic analysis object used to emit the diagnostic.
/// \param Source Type converted from; must be a Unicode character type.
/// \param Target Type converted to; must be a Unicode character type other
///               than \p Source.
/// \param E      The expression being converted.
/// \param T      The destination type as reported in the diagnostic.
/// \param CC     Location at which the diagnostic is reported.
///
/// If \p E evaluates to an integer constant, a warning is issued only when
/// the value is outside the range treated as safe for the two types, and the
/// message shows the offending value. Otherwise a generic warning is issued,
/// with a stronger wording when the source type is wider than \p T.
static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
                                                   const Type *Target, Expr *E,
                                                   QualType T,
                                                   SourceLocation CC) {
  assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
         Source != Target);

  auto &Ctx = S.getASTContext();
  Expr::EvalResult Evaluated;
  const bool IsConstant =
      E->EvaluateAsInt(Evaluated, Ctx, Expr::SE_AllowSideEffects,
                       S.isConstantEvaluatedContext());

  // Unknown value: all we can report is that the conversion is suspicious.
  if (!IsConstant) {
    const bool Narrows = Ctx.getIntWidth(E->getType()) > Ctx.getIntWidth(T);
    DiagnoseImpCast(S, E, T, CC,
                    Narrows ? diag::warn_impcast_unicode_precision
                            : diag::warn_impcast_unicode_char_type);
    return;
  }

  llvm::APSInt CodeUnit(32);
  CodeUnit = Evaluated.Val.getInt();

  // ASCII values are accepted for every pair of types. Values in the BMP
  // outside the surrogate range are accepted as long as neither side is
  // char8_t.
  const bool InASCII = CodeUnit <= 0x7F;
  const bool InBMP =
      CodeUnit <= 0xD7FF || (CodeUnit >= 0xE000 && CodeUnit <= 0xFFFF);
  const bool InvolvesChar8 = Source->isChar8Type() || Target->isChar8Type();
  if (InASCII || (!InvolvesChar8 && InBMP))
    return;

  // Determine whether the value is a whole code point in the source encoding;
  // this picks between "code unit" and "code point" in the message.
  const QualType FromTy = E->getType().getUnqualifiedType();
  bool IsWholeCodePoint;
  if (FromTy->isChar8Type()) {
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF8Codepoint(CodeUnit.getExtValue());
  } else if (FromTy->isChar16Type()) {
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF16Codepoint(CodeUnit.getExtValue());
  } else {
    assert(FromTy->isChar32Type());
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF32Codepoint(CodeUnit.getExtValue());
  }

  S.Diag(CC, diag::warn_impcast_unicode_char_type_constant)
      << E->getType() << T << IsWholeCodePoint
      << FormatUTFCodeUnitAsCodepoint(CodeUnit.getExtValue(), E->getType());
}

void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
bool *ICContext, bool IsListInit) {
if (E->isTypeDependent() || E->isValueDependent()) return;
Expand Down Expand Up @@ -12147,6 +12189,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,

DiscardMisalignedMemberAddress(Target, E);

if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
return;
}

if (Target->isBooleanType())
DiagnoseIntInBoolContext(*this, E);

Expand Down
77 changes: 77 additions & 0 deletions clang/lib/Sema/SemaExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "UsedDeclVisitor.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
Expand Down Expand Up @@ -1567,15 +1568,91 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
}
}

/// Warn when the operands of a binary operation have two different Unicode
/// character types (char8_t, char16_t, char32_t).
///
/// \param SemaRef Semantic analysis object used to emit diagnostics.
/// \param LHS     Left operand.
/// \param RHS     Right operand.
/// \param Loc     Location at which the diagnostic is reported.
/// \param ACK     Kind of arithmetic conversion being performed.
///
/// Nothing is diagnosed outside C++, when either operand is not a Unicode
/// character type, or when both operands have the same type. Comparisons are
/// additionally silenced when the known constant operand(s) are single code
/// unit code points for the types involved.
static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
                                              Expr *RHS, SourceLocation Loc,
                                              ArithConvKind ACK) {
  const QualType LHSType = LHS->getType().getUnqualifiedType();
  const QualType RHSType = RHS->getType().getUnqualifiedType();

  if (!SemaRef.getLangOpts().CPlusPlus)
    return;
  if (!LHSType->isUnicodeCharacterType() || !RHSType->isUnicodeCharacterType())
    return;

  auto &Ctx = SemaRef.getASTContext();
  if (Ctx.hasSameType(LHSType, RHSType))
    return;

  // Every non-comparison operation on mixed types gets the generic warning.
  if (ACK != ArithConvKind::Comparison) {
    SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
        << RHSType;
    return;
  }

  // True when Value is a complete code point by itself in the encoding of Ty.
  auto IsWholeCodePoint = [](const QualType &Ty, const llvm::APSInt &Value) {
    if (Ty->isChar8Type())
      return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
    if (Ty->isChar16Type())
      return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
    assert(Ty->isChar32Type());
    return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
  };

  Expr::EvalResult LHSRes, RHSRes;
  const bool LHSSuccess =
      LHS->EvaluateAsInt(LHSRes, Ctx, Expr::SE_AllowSideEffects,
                         SemaRef.isConstantEvaluatedContext());
  const bool RHSSuccess =
      RHS->EvaluateAsInt(RHSRes, Ctx, Expr::SE_AllowSideEffects,
                         SemaRef.isConstantEvaluatedContext());

  if (!LHSSuccess || !RHSSuccess) {
    // Don't warn if the one known value is representable in the type of
    // both expressions.
    if (LHSSuccess != RHSSuccess) {
      Expr::EvalResult &Known = LHSSuccess ? LHSRes : RHSRes;
      if (IsWholeCodePoint(LHSType, Known.Val.getInt()) &&
          IsWholeCodePoint(RHSType, Known.Val.getInt()))
        return;
    }

    SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << LHSType
        << RHSType;
    return;
  }

  // Both operands are constants: warn only if at least one of them is not a
  // whole code point in its own type, and show both values.
  llvm::APSInt LHSValue(32);
  LHSValue = LHSRes.Val.getInt();
  llvm::APSInt RHSValue(32);
  RHSValue = RHSRes.Val.getInt();

  if (IsWholeCodePoint(LHSType, LHSValue) &&
      IsWholeCodePoint(RHSType, RHSValue))
    return;

  SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
      << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType
      << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
      << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
}

/// UsualArithmeticConversions - Performs various conversions that are common to
/// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this
/// routine returns the first non-arithmetic type found. The client is
/// responsible for emitting appropriate error diagnostics.
QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
SourceLocation Loc,
ArithConvKind ACK) {

checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);

CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK);

if (ACK != ArithConvKind::CompAssign) {
LHS = UsualUnaryConversions(LHS.get());
if (LHS.isInvalid())
Expand Down
Loading