@@ -1432,6 +1432,95 @@ static const size_t minimumSearchBufferSize = 8192;
1432
1432
static bool searcherInUse;
1433
1433
#endif
1434
1434
1435
+ // Tailored collation rules for Japanese text search.
1436
+ // The default Unicode Collation Algorithm gives unnatural results for Japanese text.
1437
+ // These rules are intended to treat the following as different characters.
1438
+ //
1439
+ // - Small kana letters and normal kana letters
1440
+ // - Voiceless letters, voiced letters and semi-voiced letters
1441
+ //
1442
+ // This is original work written with reference to the following Unicode standard documents.
1443
+ //
1444
+ // - http://unicode.org/reports/tr10/
1445
+ // - http://unicode.org/Public/UCA/latest/allkeys.txt
1446
+ //
1447
+ static const UChar japaneseKanaCollationRules[] = {
1448
+ ' &' , 0x3041 , ' =' , 0x30a1 , ' =' , 0xff67 , ' <' , 0x3042 ,
1449
+ ' =' , 0x30a2 , ' =' , 0xff71 , ' <' , 0x3043 , ' =' , 0x30a3 ,
1450
+ ' =' , 0xff68 , ' <' , 0x3044 , ' =' , 0x30a4 , ' =' , 0xff72 ,
1451
+ ' <' , 0x3045 , ' =' , 0x30a5 , ' =' , 0xff69 , ' <' , 0x3046 ,
1452
+ ' =' , 0x30a6 , ' =' , 0xff73 , ' <' , 0x3094 , ' =' , 0x30f4 ,
1453
+ ' <' , 0x3047 , ' =' , 0x30a7 , ' =' , 0xff6a , ' <' , 0x3048 ,
1454
+ ' =' , 0x30a8 , ' =' , 0xff74 , ' <' , 0x3049 , ' =' , 0x30a9 ,
1455
+ ' =' , 0xff6b , ' <' , 0x304a , ' =' , 0x30aa , ' =' , 0xff75 ,
1456
+ ' <' , 0x3095 , ' =' , 0x30f5 , ' <' , 0x304b , ' =' , 0x30ab ,
1457
+ ' =' , 0xff76 , ' <' , 0x304c , ' =' , 0x30ac , ' <' , 0x304d ,
1458
+ ' =' , 0x30ad , ' =' , 0xff77 , ' <' , 0x304e , ' =' , 0x30ae ,
1459
+ ' <' , 0x304f , ' =' , 0x30af , ' =' , 0xff78 , ' <' , 0x3050 ,
1460
+ ' =' , 0x30b0 , ' <' , 0x3096 , ' =' , 0x30f6 , ' <' , 0x3051 ,
1461
+ ' =' , 0x30b1 , ' =' , 0xff79 , ' <' , 0x3052 , ' =' , 0x30b2 ,
1462
+ ' <' , 0x3053 , ' =' , 0x30b3 , ' =' , 0xff7a , ' <' , 0x3054 ,
1463
+ ' =' , 0x30b4 , ' <' , 0x3055 , ' =' , 0x30b5 , ' =' , 0xff7b ,
1464
+ ' <' , 0x3056 , ' =' , 0x30b6 , ' <' , 0x3057 , ' =' , 0x30b7 ,
1465
+ ' =' , 0xff7c , ' <' , 0x3058 , ' =' , 0x30b8 , ' <' , 0x3059 ,
1466
+ ' =' , 0x30b9 , ' =' , 0xff7d , ' <' , 0x305a , ' =' , 0x30ba ,
1467
+ ' <' , 0x305b , ' =' , 0x30bb , ' =' , 0xff7e , ' <' , 0x305c ,
1468
+ ' =' , 0x30bc , ' <' , 0x305d , ' =' , 0x30bd , ' =' , 0xff7f ,
1469
+ ' <' , 0x305e , ' =' , 0x30be , ' <' , 0x305f , ' =' , 0x30bf ,
1470
+ ' =' , 0xff80 , ' <' , 0x3060 , ' =' , 0x30c0 , ' <' , 0x3061 ,
1471
+ ' =' , 0x30c1 , ' =' , 0xff81 , ' <' , 0x3062 , ' =' , 0x30c2 ,
1472
+ ' <' , 0x3063 , ' =' , 0x30c3 , ' =' , 0xff6f , ' <' , 0x3064 ,
1473
+ ' =' , 0x30c4 , ' =' , 0xff82 , ' <' , 0x3065 , ' =' , 0x30c5 ,
1474
+ ' <' , 0x3066 , ' =' , 0x30c6 , ' =' , 0xff83 , ' <' , 0x3067 ,
1475
+ ' =' , 0x30c7 , ' <' , 0x3068 , ' =' , 0x30c8 , ' =' , 0xff84 ,
1476
+ ' <' , 0x3069 , ' =' , 0x30c9 , ' <' , 0x306a , ' =' , 0x30ca ,
1477
+ ' =' , 0xff85 , ' <' , 0x306b , ' =' , 0x30cb , ' =' , 0xff86 ,
1478
+ ' <' , 0x306c , ' =' , 0x30cc , ' =' , 0xff87 , ' <' , 0x306d ,
1479
+ ' =' , 0x30cd , ' =' , 0xff88 , ' <' , 0x306e , ' =' , 0x30ce ,
1480
+ ' =' , 0xff89 , ' <' , 0x306f , ' =' , 0x30cf , ' =' , 0xff8a ,
1481
+ ' <' , 0x3070 , ' =' , 0x30d0 , ' <' , 0x3071 , ' =' , 0x30d1 ,
1482
+ ' <' , 0x3072 , ' =' , 0x30d2 , ' =' , 0xff8b , ' <' , 0x3073 ,
1483
+ ' =' , 0x30d3 , ' <' , 0x3074 , ' =' , 0x30d4 , ' <' , 0x3075 ,
1484
+ ' =' , 0x30d5 , ' =' , 0xff8c , ' <' , 0x3076 , ' =' , 0x30d6 ,
1485
+ ' <' , 0x3077 , ' =' , 0x30d7 , ' <' , 0x3078 , ' =' , 0x30d8 ,
1486
+ ' =' , 0xff8d , ' <' , 0x3079 , ' =' , 0x30d9 , ' <' , 0x307a ,
1487
+ ' =' , 0x30da , ' <' , 0x307b , ' =' , 0x30db , ' =' , 0xff8e ,
1488
+ ' <' , 0x307c , ' =' , 0x30dc , ' <' , 0x307d , ' =' , 0x30dd ,
1489
+ ' <' , 0x307e , ' =' , 0x30de , ' =' , 0xff8f , ' <' , 0x307f ,
1490
+ ' =' , 0x30df , ' =' , 0xff90 , ' <' , 0x3080 , ' =' , 0x30e0 ,
1491
+ ' =' , 0xff91 , ' <' , 0x3081 , ' =' , 0x30e1 , ' =' , 0xff92 ,
1492
+ ' <' , 0x3082 , ' =' , 0x30e2 , ' =' , 0xff93 , ' <' , 0x3083 ,
1493
+ ' =' , 0x30e3 , ' =' , 0xff6c , ' <' , 0x3084 , ' =' , 0x30e4 ,
1494
+ ' =' , 0xff94 , ' <' , 0x3085 , ' =' , 0x30e5 , ' =' , 0xff6d ,
1495
+ ' <' , 0x3086 , ' =' , 0x30e6 , ' =' , 0xff95 , ' <' , 0x3087 ,
1496
+ ' =' , 0x30e7 , ' =' , 0xff6e , ' <' , 0x3088 , ' =' , 0x30e8 ,
1497
+ ' =' , 0xff96 , ' <' , 0x3089 , ' =' , 0x30e9 , ' =' , 0xff97 ,
1498
+ ' <' , 0x308a , ' =' , 0x30ea , ' =' , 0xff98 , ' <' , 0x308b ,
1499
+ ' =' , 0x30eb , ' =' , 0xff99 , ' <' , 0x308c , ' =' , 0x30ec ,
1500
+ ' =' , 0xff9a , ' <' , 0x308d , ' =' , 0x30ed , ' =' , 0xff9b ,
1501
+ ' <' , 0x308e , ' =' , 0x30ee , ' <' , 0x308f , ' =' , 0x30ef ,
1502
+ ' =' , 0xff9c , ' <' , 0x30f7 , ' <' , 0x3090 , ' =' , 0x30f0 ,
1503
+ ' <' , 0x30f8 , ' <' , 0x3091 , ' =' , 0x30f1 , ' <' , 0x3092 ,
1504
+ ' =' , 0x30f2 , ' =' , 0xff66 , ' <' , 0x3093 , ' =' , 0x30f3 ,
1505
+ ' =' , 0xff9d , 0
1506
+ };
1507
+
1508
+ static UCollator* createCollator ()
1509
+ {
1510
+ // Set tailored collation rules to fix Japanese text search.
1511
+ // See the comments before japaneseKanaCollationRules for details.
1512
+ UErrorCode status = U_ZERO_ERROR;
1513
+ UCollator* collator = ucol_openRules (japaneseKanaCollationRules, -1 , UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, 0 , &status);
1514
+ ASSERT (status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1515
+ return collator;
1516
+ }
1517
+
1518
+ static UCollator* collator ()
1519
+ {
1520
+ static UCollator* collator = createCollator ();
1521
+ return collator;
1522
+ }
1523
+
1435
1524
// Opens an ICU string searcher on a one-character placeholder pattern and text
// (both &newlineCharacter, length 1) for currentSearchLocaleID(), then installs
// the collator returned by collator() and resets the searcher before returning it.
// NOTE(review): part of this body lies between two diff hunks and is not visible here.
static UStringSearch* createSearcher ()
1436
1525
{
1437
1526
// Provide a non-empty pattern and non-empty text so usearch_open will not fail,
@@ -1440,6 +1529,10 @@ static UStringSearch* createSearcher()
1440
1529
UErrorCode status = U_ZERO_ERROR;
1441
1530
UStringSearch* searcher = usearch_open (&newlineCharacter, 1 , &newlineCharacter, 1 , currentSearchLocaleID (), 0 , &status);
1442
1531
ASSERT (status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1532
// Clear any warning left by usearch_open so the next check reflects only usearch_setCollator.
+ status = U_ZERO_ERROR;
1533
// Replace the searcher collator with the one returned by collator().
+ usearch_setCollator (searcher, collator (), &status);
1534
+ ASSERT (status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1535
// NOTE(review): presumably resets iteration state after the collator swap; confirm against the ICU usearch documentation.
+ usearch_reset (searcher);
1443
1536
return searcher;
1444
1537
}
1445
1538
0 commit comments