Skip to content

Commit 381a649

Browse files
authored
[Clang] Add warnings when mixing different charN_t types (llvm#138708)
The charN_t types represent code units of different UTF encodings. Therefore, the values of two objects of different charN_t types do not necessarily represent the same characters. In order to avoid comparing apples and oranges, we add new warnings for: - Implicit conversions - Comparisons - Other cases involving arithmetic conversions. We only produce the warning if we cannot establish through constant evaluation that the operation would be safe. The new `-Wcharacter-conversion` warning is enabled by default. Note that this PR intentionally doesn't touch char/wchar_t, but it would be worth considering extending the new warnings to these types (in a follow-up). Additionally, most arithmetic operations on charN_t don't really make sense (i.e., what does it mean to add code units?), so we could add warnings for that. Fixes llvm#138526
1 parent 540cf25 commit 381a649

File tree

22 files changed

+392
-14
lines changed

22 files changed

+392
-14
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,10 @@ Improvements to Clang's diagnostics
534534
packing may differ under the MS struct ABI (#GH117428).
535535

536536

537+
- A new ``-Wcharacter-conversion`` warns when comparing or implicitly converting
538+
between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
539+
This warning only triggers in C++ as these types are aliases in C. (#GH138526)
540+
537541
Improvements to Clang's time-trace
538542
----------------------------------
539543

clang/include/clang/AST/ASTDiagnostic.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ namespace clang {
3838
/// is initialized before passing it in.
3939
QualType desugarForDiagnostic(ASTContext &Context, QualType QT,
4040
bool &ShouldAKA);
41+
42+
std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T);
43+
4144
} // end namespace clang
4245

4346
#endif

clang/include/clang/AST/Type.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2521,6 +2521,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
25212521
bool isChar16Type() const;
25222522
bool isChar32Type() const;
25232523
bool isAnyCharacterType() const;
2524+
bool isUnicodeCharacterType() const;
25242525
bool isIntegralType(const ASTContext &Ctx) const;
25252526

25262527
/// Determine whether this type is an integral or enumeration type.

clang/include/clang/Basic/DiagnosticGroups.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
111111
ImplicitEnumEnumCast,
112112
EnumFloatConversion,
113113
EnumCompareConditional]>;
114+
def CharacterConversion : DiagGroup<"character-conversion">;
114115
def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
115116
def ObjCSignedCharBoolImplicitIntConversion :
116117
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
@@ -1119,6 +1120,7 @@ def Parentheses : DiagGroup<"parentheses",
11191120
// - __null-to-integer conversion warnings are on by default
11201121
def Conversion : DiagGroup<"conversion",
11211122
[BoolConversion,
1123+
CharacterConversion,
11221124
ConstantConversion,
11231125
EnumConversion,
11241126
BitFieldEnumConversion,

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4369,6 +4369,29 @@ def warn_address_of_reference_bool_conversion : Warning<
43694369
"code; pointer may be assumed to always convert to true">,
43704370
InGroup<UndefinedBoolConversion>;
43714371

4372+
def warn_impcast_unicode_char_type
4373+
: Warning<"implicit conversion from %0 to %1 may change the meaning of the "
4374+
"represented code unit">,
4375+
InGroup<CharacterConversion>;
4376+
def warn_impcast_unicode_precision
4377+
: Warning<"implicit conversion from %0 to %1 may lose precision and change "
4378+
"the meaning of the represented code unit">,
4379+
InGroup<CharacterConversion>;
4380+
def warn_impcast_unicode_char_type_constant
4381+
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
4382+
"%select{code unit|code point}2 '%3'">,
4383+
InGroup<CharacterConversion>;
4384+
4385+
def warn_comparison_unicode_mixed_types
4386+
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
4387+
"may compare different code points">,
4388+
InGroup<CharacterConversion>;
4389+
4390+
def warn_comparison_unicode_mixed_types_constant
4391+
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
4392+
"compares unrelated code units '%2' and '%3'">,
4393+
InGroup<CharacterConversion>;
4394+
43724395
def warn_xor_used_as_pow : Warning<
43734396
"result of '%0' is %1; did you mean exponentiation?">,
43744397
InGroup<XorUsedAsPow>;
@@ -6834,7 +6857,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <
68346857

68356858
def note_counted_by_consider_completing_pointee_ty : Note<
68366859
"consider providing a complete definition for %0">;
6837-
6860+
68386861
def note_counted_by_consider_using_sized_by : Note<
68396862
"consider using '__sized_by%select{|_or_null}0' instead of "
68406863
"'__counted_by%select{|_or_null}0'">;
@@ -7733,6 +7756,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
77337756
"%diff{ ($ and $)|}0,1">,
77347757
InGroup<EnumCompareSwitch>;
77357758

7759+
def warn_arith_conv_mixed_unicode_types
7760+
: Warning<"%sub{select_arith_conv_kind}0 "
7761+
"different Unicode character types %1 and %2">,
7762+
InGroup<CharacterConversion>;
7763+
77367764
def err_typecheck_assign_const : Error<
77377765
"%select{"
77387766
"cannot assign to return value because function %1 returns a const value|"

clang/lib/AST/ASTDiagnostic.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include "clang/AST/TemplateBase.h"
2121
#include "clang/AST/Type.h"
2222
#include "llvm/ADT/StringExtras.h"
23+
#include "llvm/Support/ConvertUTF.h"
24+
#include "llvm/Support/Format.h"
2325
#include "llvm/Support/raw_ostream.h"
2426

2527
using namespace clang;
@@ -2190,3 +2192,31 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType,
21902192
TD.DiffTemplate();
21912193
return TD.Emit();
21922194
}
2195+
2196+
std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
2197+
auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) {
2198+
if (T->isChar8Type()) {
2199+
assert(Value <= 0xFF && "not a valid UTF-8 code unit");
2200+
return Value <= 0x7F;
2201+
}
2202+
if (T->isChar16Type()) {
2203+
assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
2204+
return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
2205+
}
2206+
assert(T->isChar32Type());
2207+
return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
2208+
};
2209+
llvm::SmallVector<char, 16> Str;
2210+
if (!IsSingleCodeUnitCP(Value, T)) {
2211+
llvm::raw_svector_ostream OS(Str);
2212+
OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
2213+
return std::string(Str.begin(), Str.end());
2214+
}
2215+
2216+
char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
2217+
char *Ptr = Buffer;
2218+
[[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr);
2219+
assert(Converted && "trying to encode invalid code unit");
2220+
EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str);
2221+
return std::string(Str.begin(), Str.end());
2222+
}

clang/lib/AST/Type.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const {
21932193
}
21942194
}
21952195

2196+
bool Type::isUnicodeCharacterType() const {
2197+
const auto *BT = dyn_cast<BuiltinType>(CanonicalType);
2198+
if (!BT)
2199+
return false;
2200+
switch (BT->getKind()) {
2201+
default:
2202+
return false;
2203+
case BuiltinType::Char8:
2204+
case BuiltinType::Char16:
2205+
case BuiltinType::Char32:
2206+
return true;
2207+
}
2208+
}
2209+
21962210
/// isSignedIntegerType - Return true if this is an integer type that is
21972211
/// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..],
21982212
/// an enum decl which has a signed representation

clang/lib/Sema/SemaChecking.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "CheckExprLifetime.h"
1515
#include "clang/AST/APValue.h"
1616
#include "clang/AST/ASTContext.h"
17+
#include "clang/AST/ASTDiagnostic.h"
1718
#include "clang/AST/Attr.h"
1819
#include "clang/AST/AttrIterator.h"
1920
#include "clang/AST/CharUnits.h"
@@ -11871,6 +11872,47 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) {
1187111872
}
1187211873
}
1187311874

11875+
/// Diagnose an implicit conversion of \p E between two distinct Unicode
/// character types.
///
/// If \p E evaluates to an integer constant, a warning is emitted only when
/// the value is neither ASCII nor (for conversions not involving char8_t) a
/// non-surrogate value up to 0xFFFF. If \p E cannot be evaluated, a generic
/// warning is emitted; the "precision" variant is chosen when the source type
/// is wider than the destination type.
///
/// \param S      the semantic analysis object used to emit diagnostics.
/// \param Source the source type; asserted to be a Unicode character type.
/// \param Target the destination type; asserted to be a Unicode character
///               type different from \p Source.
/// \param E      the expression being converted.
/// \param T      the destination type as reported in the diagnostic.
/// \param CC     the location at which the diagnostic is reported.
static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
                                                   const Type *Target, Expr *E,
                                                   QualType T,
                                                   SourceLocation CC) {
  assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
         Source != Target);

  Expr::EvalResult Result;
  const bool IsConstant =
      E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
                       S.isConstantEvaluatedContext());

  if (!IsConstant) {
    // Value unknown: pick the diagnostic from the relative widths alone.
    ASTContext &Ctx = S.getASTContext();
    unsigned DiagID = diag::warn_impcast_unicode_char_type;
    if (Ctx.getIntWidth(E->getType()) > Ctx.getIntWidth(T))
      DiagID = diag::warn_impcast_unicode_precision;
    DiagnoseImpCast(S, E, T, CC, DiagID);
    return;
  }

  const llvm::APSInt CodeUnit = Result.Val.getInt();

  // ASCII values are accepted for every pair of types.
  if (CodeUnit <= 0x7F)
    return;

  // Values up to 0xFFFF outside the surrogate range are accepted as long as
  // neither side of the conversion is char8_t.
  const bool InvolvesChar8 = Source->isChar8Type() || Target->isChar8Type();
  const bool IsBMP =
      CodeUnit <= 0xD7FF || (CodeUnit >= 0xE000 && CodeUnit <= 0xFFFF);
  if (!InvolvesChar8 && IsBMP)
    return;

  // Selects the "code unit" / "code point" wording of the diagnostic.
  const QualType FromTy = E->getType().getUnqualifiedType();
  bool IsWholeCodePoint;
  if (FromTy->isChar8Type()) {
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF8Codepoint(CodeUnit.getExtValue());
  } else if (FromTy->isChar16Type()) {
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF16Codepoint(CodeUnit.getExtValue());
  } else {
    assert(FromTy->isChar32Type());
    IsWholeCodePoint =
        llvm::IsSingleCodeUnitUTF32Codepoint(CodeUnit.getExtValue());
  }

  S.Diag(CC, diag::warn_impcast_unicode_char_type_constant)
      << E->getType() << T << IsWholeCodePoint
      << FormatUTFCodeUnitAsCodepoint(CodeUnit.getExtValue(), E->getType());
}
11915+
1187411916
void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
1187511917
bool *ICContext, bool IsListInit) {
1187611918
if (E->isTypeDependent() || E->isValueDependent()) return;
@@ -12208,6 +12250,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
1220812250

1220912251
DiscardMisalignedMemberAddress(Target, E);
1221012252

12253+
if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
12254+
DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
12255+
return;
12256+
}
12257+
1221112258
if (Target->isBooleanType())
1221212259
DiagnoseIntInBoolContext(*this, E);
1221312260

clang/lib/Sema/SemaExpr.cpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "UsedDeclVisitor.h"
1616
#include "clang/AST/ASTConsumer.h"
1717
#include "clang/AST/ASTContext.h"
18+
#include "clang/AST/ASTDiagnostic.h"
1819
#include "clang/AST/ASTLambda.h"
1920
#include "clang/AST/ASTMutationListener.h"
2021
#include "clang/AST/CXXInheritance.h"
@@ -1568,15 +1569,91 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
15681569
}
15691570
}
15701571

1572+
/// Warn when the operands of a binary operation have two different Unicode
/// character types.
///
/// Nothing is diagnosed outside C++, when either operand is not a Unicode
/// character type, or when both operands have the same type. For comparisons,
/// constant evaluation is used to suppress the warning when the known
/// operand(s) are each a complete code point on their own; every other kind of
/// arithmetic conversion is diagnosed unconditionally.
///
/// \param SemaRef the semantic analysis object used to emit diagnostics.
/// \param LHS     the left-hand operand.
/// \param RHS     the right-hand operand.
/// \param Loc     the location at which the diagnostic is reported.
/// \param ACK     the kind of arithmetic conversion being performed.
static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
                                              Expr *RHS, SourceLocation Loc,
                                              ArithConvKind ACK) {
  QualType LTy = LHS->getType().getUnqualifiedType();
  QualType RTy = RHS->getType().getUnqualifiedType();

  if (!(SemaRef.getLangOpts().CPlusPlus && LTy->isUnicodeCharacterType() &&
        RTy->isUnicodeCharacterType()))
    return;

  ASTContext &Ctx = SemaRef.getASTContext();

  // Operands of identical type are never diagnosed, whatever the operation.
  if (Ctx.hasSameType(LTy, RTy))
    return;

  if (ACK != ArithConvKind::Comparison) {
    SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LTy << RTy;
    return;
  }

  // True when Value, taken alone, is a complete code point in the encoding
  // implied by Ty.
  auto EncodesCodePoint = [](const QualType &Ty, const llvm::APSInt &Value) {
    if (Ty->isChar8Type())
      return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
    if (Ty->isChar16Type())
      return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
    assert(Ty->isChar32Type());
    return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
  };

  Expr::EvalResult LHSRes, RHSRes;
  const bool LHSKnown =
      LHS->EvaluateAsInt(LHSRes, Ctx, Expr::SE_AllowSideEffects,
                         SemaRef.isConstantEvaluatedContext());
  const bool RHSKnown =
      RHS->EvaluateAsInt(RHSRes, Ctx, Expr::SE_AllowSideEffects,
                         SemaRef.isConstantEvaluatedContext());

  // Exactly one operand is a known constant: don't warn if that value is a
  // single-code-unit code point in the types of both operands.
  if (LHSKnown != RHSKnown) {
    const llvm::APSInt &Known = (LHSKnown ? LHSRes : RHSRes).Val.getInt();
    if (EncodesCodePoint(LTy, Known) && EncodesCodePoint(RTy, Known))
      return;
  }

  // At least one operand is not a constant: emit the generic warning.
  if (!(LHSKnown && RHSKnown)) {
    SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
        << LHS->getSourceRange() << RHS->getSourceRange() << LTy << RTy;
    return;
  }

  // Both operands are constants: stay quiet only if each one is a complete
  // code point in its own type.
  const llvm::APSInt LHSValue = LHSRes.Val.getInt();
  const llvm::APSInt RHSValue = RHSRes.Val.getInt();
  if (EncodesCodePoint(LTy, LHSValue) && EncodesCodePoint(RTy, RHSValue))
    return;

  SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
      << LHS->getSourceRange() << RHS->getSourceRange() << LTy << RTy
      << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LTy)
      << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RTy);
}
1644+
15711645
/// UsualArithmeticConversions - Performs various conversions that are common to
15721646
/// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this
15731647
/// routine returns the first non-arithmetic type found. The client is
15741648
/// responsible for emitting appropriate error diagnostics.
15751649
QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
15761650
SourceLocation Loc,
15771651
ArithConvKind ACK) {
1652+
15781653
checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);
15791654

1655+
CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK);
1656+
15801657
if (ACK != ArithConvKind::CompAssign) {
15811658
LHS = UsualUnaryConversions(LHS.get());
15821659
if (LHS.isInvalid())

0 commit comments

Comments
 (0)