Skip to content

Commit 23ca191

Browse files
author
Matthew Barnett
committed
Git issue 575: Issues with ASCII/Unicode modifiers
1 parent 88fee85 commit 23ca191

File tree

9 files changed

+664
-789
lines changed

9 files changed

+664
-789
lines changed

changelog.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
+Version: 2025.7.34
+
+Git issue 575: Issues with ASCII/Unicode modifiers
+
 Version: 2025.7.33
 
 Updated main.yml and pyproject.toml.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "regex"
-version = "2025.7.33"
+version = "2025.7.34"
 description = "Alternative regular expression module, to replace re."
 readme = "README.rst"
 authors = [

regex_3/_regex.c

Lines changed: 126 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@
 typedef RE_UINT32 RE_CODE;
 typedef unsigned char BYTE;
 
+/* An unassigned codepoint. */
+#define UNASSIGNED_CODEPOINT 0x10FFFF
+
 /* Properties in the General Category. */
 #define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN)
 #define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU)
@@ -157,6 +160,11 @@ typedef RE_UINT32 RE_STATUS_T;
 /* Various flags stored in a node status member. */
 #define RE_STATUS_SHIFT 11
 
+#define RE_ENCODING_SHIFT 16
+#define ASCII_ENCODING 1
+#define UNICODE_ENCODING 2
+#define ENCODING_KIND(NODE) (((NODE)->status >> RE_ENCODING_SHIFT) & 0x3)
+
 #define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT)
 #define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT)
 #define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT)
@@ -809,12 +817,8 @@ Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch);
 /* Checks whether a character has a property. */
 Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
 if (ch > RE_ASCII_MAX) {
-/* Outside the ASCII range. */
-RE_UINT32 value;
-
-value = property & 0xFFFF;
-
-return value == 0;
+/* Treat it as an unassigned codepoint. */
+ch = UNASSIGNED_CODEPOINT;
 }
 
 return unicode_has_property(property, ch);
@@ -824,19 +828,12 @@ Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
 Py_LOCAL_INLINE(BOOL) ascii_has_property_ign(RE_CODE property, Py_UCS4 ch) {
 RE_UINT32 prop;
 
-prop = property >> 16;
-
-/* We are working with ASCII. */
-if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property ==
-RE_PROP_GC_LT) {
-RE_UINT32 value;
-
-value = re_get_general_category(ch);
+if (ch > RE_ASCII_MAX) {
+/* Treat it as an unassigned codepoint. */
+ch = UNASSIGNED_CODEPOINT;
+}
 
-return value == RE_PROP_LU || value == RE_PROP_LL || value ==
-RE_PROP_LT;
-} else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
-return (BOOL)re_get_cased(ch);
+prop = property >> 16;
 
 /* The property is case-insensitive. */
 return ascii_has_property(property, ch);
@@ -2902,7 +2899,14 @@ Py_LOCAL_INLINE(BOOL) matches_CHARACTER_IGN(RE_EncodingTable* encoding,
 /* Checks whether a character has a property. */
 Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding,
 RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) {
-return encoding->has_property(locale_info, node->values[0], ch);
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+return ascii_encoding.has_property(locale_info, node->values[0], ch);
+case UNICODE_ENCODING:
+return unicode_encoding.has_property(locale_info, node->values[0], ch);
+default:
+return encoding->has_property(locale_info, node->values[0], ch);
+}
 }
 
 /* Checks whether a character has a property, ignoring case. */
@@ -2914,6 +2918,15 @@ Py_LOCAL_INLINE(BOOL) matches_PROPERTY_IGN(RE_EncodingTable* encoding,
 property = node->values[0];
 prop = property >> 16;
 
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+encoding = &ascii_encoding;
+break;
+case UNICODE_ENCODING:
+encoding = &unicode_encoding;
+break;
+}
+
 /* We need to do special handling of case-sensitive properties according to
 * the 'encoding'.
 */
@@ -3000,7 +3013,15 @@ Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_LocaleInfo*
 /* values are: property */
 TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
 member->values[0]))
-return encoding->has_property(locale_info, member->values[0], ch);
+
+switch (ENCODING_KIND(member)) {
+case ASCII_ENCODING:
+return ascii_encoding.has_property(locale_info, member->values[0], ch);
+case UNICODE_ENCODING:
+return unicode_encoding.has_property(locale_info, member->values[0], ch);
+default:
+return encoding->has_property(locale_info, member->values[0], ch);
+}
 case RE_OP_RANGE:
 /* values are: lower, upper */
 TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
@@ -4006,7 +4027,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY(RE_State* state, RE_Node* node,
 
 text = state->text;
 match = node->match == match;
-encoding = state->encoding;
+
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+encoding = &ascii_encoding;
+break;
+case UNICODE_ENCODING:
+encoding = &unicode_encoding;
+break;
+default:
+encoding = state->encoding;
+break;
+}
+
 locale_info = state->locale_info;
 property = node->values[0];
 
@@ -4104,7 +4137,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN(RE_State* state, RE_Node*
 
 text = state->text;
 match = node->match == match;
-encoding = state->encoding;
+
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+encoding = &ascii_encoding;
+break;
+case UNICODE_ENCODING:
+encoding = &unicode_encoding;
+break;
+default:
+encoding = state->encoding;
+break;
+}
+
 locale_info = state->locale_info;
 property = node->values[0];
 
@@ -4202,7 +4247,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN_REV(RE_State* state,
 
 text = state->text;
 match = node->match == match;
-encoding = state->encoding;
+
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+encoding = &ascii_encoding;
+break;
+case UNICODE_ENCODING:
+encoding = &unicode_encoding;
+break;
+default:
+encoding = state->encoding;
+break;
+}
+
 locale_info = state->locale_info;
 property = node->values[0];
 
@@ -4300,7 +4357,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_REV(RE_State* state, RE_Node*
 
 text = state->text;
 match = node->match == match;
-encoding = state->encoding;
+
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+encoding = &ascii_encoding;
+break;
+case UNICODE_ENCODING:
+encoding = &unicode_encoding;
+break;
+default:
+encoding = state->encoding;
+break;
+}
+
 locale_info = state->locale_info;
 property = node->values[0];
 
@@ -6882,8 +6951,17 @@ Py_LOCAL_INLINE(int) try_match_ANY_U_REV(RE_State* state, RE_Node* node,
 /* Checks whether a position is on a word boundary. */
 Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node,
 Py_ssize_t text_pos) {
-return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
-node->match);
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+return bool_as_status(ascii_encoding.at_boundary(state, text_pos) ==
+node->match);
+case UNICODE_ENCODING:
+return bool_as_status(unicode_encoding.at_boundary(state, text_pos) ==
+node->match);
+default:
+return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
+node->match);
+}
 }
 
 /* Checks whether there's a character at a position. */
@@ -7724,7 +7802,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY(RE_State* state, RE_Node*
 node, Py_ssize_t text_pos, BOOL* is_partial) {
 BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
 
-at_boundary = state->encoding->at_boundary;
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+at_boundary = ascii_encoding.at_boundary;
+break;
+case UNICODE_ENCODING:
+at_boundary = unicode_encoding.at_boundary;
+break;
+default:
+at_boundary = state->encoding->at_boundary;
+break;
+}
 
 *is_partial = FALSE;
 
@@ -7744,7 +7832,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY_rev(RE_State* state, RE_Node*
 node, Py_ssize_t text_pos, BOOL* is_partial) {
 BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
 
-at_boundary = state->encoding->at_boundary;
+switch (ENCODING_KIND(node)) {
+case ASCII_ENCODING:
+at_boundary = ascii_encoding.at_boundary;
+break;
+case UNICODE_ENCODING:
+at_boundary = unicode_encoding.at_boundary;
+break;
+default:
+at_boundary = state->encoding->at_boundary;
+break;
+}
 
 *is_partial = FALSE;
 
0 commit comments

Comments
 (0)