58
58
typedef RE_UINT32 RE_CODE;
59
59
typedef unsigned char BYTE;
60
60
61
+ /* An unassigned codepoint. */
62
+ #define UNASSIGNED_CODEPOINT 0x10FFFF
63
+
61
64
/* Properties in the General Category. */
62
65
#define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN)
63
66
#define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU)
@@ -157,6 +160,11 @@ typedef RE_UINT32 RE_STATUS_T;
157
160
/* Various flags stored in a node status member. */
158
161
#define RE_STATUS_SHIFT 11
159
162
163
+ #define RE_ENCODING_SHIFT 16
164
+ #define ASCII_ENCODING 1
165
+ #define UNICODE_ENCODING 2
166
+ #define ENCODING_KIND(NODE) (((NODE)->status >> RE_ENCODING_SHIFT) & 0x3)
167
+
160
168
#define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT)
161
169
#define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT)
162
170
#define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT)
@@ -809,12 +817,8 @@ Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch);
809
817
/* Checks whether a character has a property. */
810
818
Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
811
819
if (ch > RE_ASCII_MAX) {
812
- /* Outside the ASCII range. */
813
- RE_UINT32 value;
814
-
815
- value = property & 0xFFFF;
816
-
817
- return value == 0;
820
+ /* Treat it as an unassigned codepoint. */
821
+ ch = UNASSIGNED_CODEPOINT;
818
822
}
819
823
820
824
return unicode_has_property(property, ch);
@@ -824,19 +828,12 @@ Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
824
828
Py_LOCAL_INLINE(BOOL) ascii_has_property_ign(RE_CODE property, Py_UCS4 ch) {
825
829
RE_UINT32 prop;
826
830
827
- prop = property >> 16;
828
-
829
- /* We are working with ASCII. */
830
- if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property ==
831
- RE_PROP_GC_LT) {
832
- RE_UINT32 value;
833
-
834
- value = re_get_general_category(ch);
831
+ if (ch > RE_ASCII_MAX) {
832
+ /* Treat it as an unassigned codepoint. */
833
+ ch = UNASSIGNED_CODEPOINT;
834
+ }
835
835
836
- return value == RE_PROP_LU || value == RE_PROP_LL || value ==
837
- RE_PROP_LT;
838
- } else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
839
- return (BOOL)re_get_cased(ch);
836
+ prop = property >> 16;
840
837
841
838
/* The property is case-insensitive. */
842
839
return ascii_has_property(property, ch);
@@ -2902,7 +2899,14 @@ Py_LOCAL_INLINE(BOOL) matches_CHARACTER_IGN(RE_EncodingTable* encoding,
2902
2899
/* Checks whether a character has a property. */
2903
2900
Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding,
2904
2901
RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) {
2905
- return encoding->has_property(locale_info, node->values[0], ch);
2902
+ switch (ENCODING_KIND(node)) {
2903
+ case ASCII_ENCODING:
2904
+ return ascii_encoding.has_property(locale_info, node->values[0], ch);
2905
+ case UNICODE_ENCODING:
2906
+ return unicode_encoding.has_property(locale_info, node->values[0], ch);
2907
+ default:
2908
+ return encoding->has_property(locale_info, node->values[0], ch);
2909
+ }
2906
2910
}
2907
2911
2908
2912
/* Checks whether a character has a property, ignoring case. */
@@ -2914,6 +2918,15 @@ Py_LOCAL_INLINE(BOOL) matches_PROPERTY_IGN(RE_EncodingTable* encoding,
2914
2918
property = node->values[0];
2915
2919
prop = property >> 16;
2916
2920
2921
+ switch (ENCODING_KIND(node)) {
2922
+ case ASCII_ENCODING:
2923
+ encoding = &ascii_encoding;
2924
+ break;
2925
+ case UNICODE_ENCODING:
2926
+ encoding = &unicode_encoding;
2927
+ break;
2928
+ }
2929
+
2917
2930
/* We need to do special handling of case-sensitive properties according to
2918
2931
* the 'encoding'.
2919
2932
*/
@@ -3000,7 +3013,15 @@ Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_LocaleInfo*
3000
3013
/* values are: property */
3001
3014
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
3002
3015
member->values[0]))
3003
- return encoding->has_property(locale_info, member->values[0], ch);
3016
+
3017
+ switch (ENCODING_KIND(member)) {
3018
+ case ASCII_ENCODING:
3019
+ return ascii_encoding.has_property(locale_info, member->values[0], ch);
3020
+ case UNICODE_ENCODING:
3021
+ return unicode_encoding.has_property(locale_info, member->values[0], ch);
3022
+ default:
3023
+ return encoding->has_property(locale_info, member->values[0], ch);
3024
+ }
3004
3025
case RE_OP_RANGE:
3005
3026
/* values are: lower, upper */
3006
3027
TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
@@ -4006,7 +4027,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY(RE_State* state, RE_Node* node,
4006
4027
4007
4028
text = state->text;
4008
4029
match = node->match == match;
4009
- encoding = state->encoding;
4030
+
4031
+ switch (ENCODING_KIND(node)) {
4032
+ case ASCII_ENCODING:
4033
+ encoding = &ascii_encoding;
4034
+ break;
4035
+ case UNICODE_ENCODING:
4036
+ encoding = &unicode_encoding;
4037
+ break;
4038
+ default:
4039
+ encoding = state->encoding;
4040
+ break;
4041
+ }
4042
+
4010
4043
locale_info = state->locale_info;
4011
4044
property = node->values[0];
4012
4045
@@ -4104,7 +4137,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN(RE_State* state, RE_Node*
4104
4137
4105
4138
text = state->text;
4106
4139
match = node->match == match;
4107
- encoding = state->encoding;
4140
+
4141
+ switch (ENCODING_KIND(node)) {
4142
+ case ASCII_ENCODING:
4143
+ encoding = &ascii_encoding;
4144
+ break;
4145
+ case UNICODE_ENCODING:
4146
+ encoding = &unicode_encoding;
4147
+ break;
4148
+ default:
4149
+ encoding = state->encoding;
4150
+ break;
4151
+ }
4152
+
4108
4153
locale_info = state->locale_info;
4109
4154
property = node->values[0];
4110
4155
@@ -4202,7 +4247,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN_REV(RE_State* state,
4202
4247
4203
4248
text = state->text;
4204
4249
match = node->match == match;
4205
- encoding = state->encoding;
4250
+
4251
+ switch (ENCODING_KIND(node)) {
4252
+ case ASCII_ENCODING:
4253
+ encoding = &ascii_encoding;
4254
+ break;
4255
+ case UNICODE_ENCODING:
4256
+ encoding = &unicode_encoding;
4257
+ break;
4258
+ default:
4259
+ encoding = state->encoding;
4260
+ break;
4261
+ }
4262
+
4206
4263
locale_info = state->locale_info;
4207
4264
property = node->values[0];
4208
4265
@@ -4300,7 +4357,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_REV(RE_State* state, RE_Node*
4300
4357
4301
4358
text = state->text;
4302
4359
match = node->match == match;
4303
- encoding = state->encoding;
4360
+
4361
+ switch (ENCODING_KIND(node)) {
4362
+ case ASCII_ENCODING:
4363
+ encoding = &ascii_encoding;
4364
+ break;
4365
+ case UNICODE_ENCODING:
4366
+ encoding = &unicode_encoding;
4367
+ break;
4368
+ default:
4369
+ encoding = state->encoding;
4370
+ break;
4371
+ }
4372
+
4304
4373
locale_info = state->locale_info;
4305
4374
property = node->values[0];
4306
4375
@@ -6882,8 +6951,17 @@ Py_LOCAL_INLINE(int) try_match_ANY_U_REV(RE_State* state, RE_Node* node,
6882
6951
/* Checks whether a position is on a word boundary. */
6883
6952
Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node,
6884
6953
Py_ssize_t text_pos) {
6885
- return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
6886
- node->match);
6954
+ switch (ENCODING_KIND(node)) {
6955
+ case ASCII_ENCODING:
6956
+ return bool_as_status(ascii_encoding.at_boundary(state, text_pos) ==
6957
+ node->match);
6958
+ case UNICODE_ENCODING:
6959
+ return bool_as_status(unicode_encoding.at_boundary(state, text_pos) ==
6960
+ node->match);
6961
+ default:
6962
+ return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
6963
+ node->match);
6964
+ }
6887
6965
}
6888
6966
6889
6967
/* Checks whether there's a character at a position. */
@@ -7724,7 +7802,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY(RE_State* state, RE_Node*
7724
7802
node, Py_ssize_t text_pos, BOOL* is_partial) {
7725
7803
BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
7726
7804
7727
- at_boundary = state->encoding->at_boundary;
7805
+ switch (ENCODING_KIND(node)) {
7806
+ case ASCII_ENCODING:
7807
+ at_boundary = ascii_encoding.at_boundary;
7808
+ break;
7809
+ case UNICODE_ENCODING:
7810
+ at_boundary = unicode_encoding.at_boundary;
7811
+ break;
7812
+ default:
7813
+ at_boundary = state->encoding->at_boundary;
7814
+ break;
7815
+ }
7728
7816
7729
7817
*is_partial = FALSE;
7730
7818
@@ -7744,7 +7832,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY_rev(RE_State* state, RE_Node*
7744
7832
node, Py_ssize_t text_pos, BOOL* is_partial) {
7745
7833
BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
7746
7834
7747
- at_boundary = state->encoding->at_boundary;
7835
+ switch (ENCODING_KIND(node)) {
7836
+ case ASCII_ENCODING:
7837
+ at_boundary = ascii_encoding.at_boundary;
7838
+ break;
7839
+ case UNICODE_ENCODING:
7840
+ at_boundary = unicode_encoding.at_boundary;
7841
+ break;
7842
+ default:
7843
+ at_boundary = state->encoding->at_boundary;
7844
+ break;
7845
+ }
7748
7846
7749
7847
*is_partial = FALSE;
7750
7848
0 commit comments