123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567 |
- This is a dump from Google's source control system of the change
- that removed UCS-2 support from RE2. As the explanation below
- says, UCS-2 mode is fundamentally at odds with operators like ^ and $
- (multiline ^ and $, as well as \b and \B, were rejected as unsupported),
- so it never really worked very well. But if you are interested in using
- it without those operators, it did work for that. It assumed that the
- UCS-2 data was in the native host byte order.
- If you are interested in adding UCS-2 mode back, this patch (applied
- in reverse, since it is the change that removed the UCS-2 code) might
- be a good starting point.
- Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
- Retire UCS-2 mode.
-
- I added it as an experiment for V8, but it
- requires 2-byte lookahead to do completely,
- and RE2 has 1-byte lookahead (enough for UTF-8)
- as a fairly deep fundamental assumption,
- so it did not support ^ or $.
- ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
- re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
- cap_[0] = p;
- if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
- return true;
- - if (prog_->flags() & Regexp::UCS2)
- - p++;
- }
- return false;
- }
- ==== re2/compile.cc#17 - re2/compile.cc#18 ====
- re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
- // Input encodings.
- enum Encoding {
- kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
- - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
- kEncodingLatin1, // Latin1 (0-FF)
- };
-
- re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
- void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
- void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
- void Add_80_10ffff();
- - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
- - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
- - uint8 lo2, uint8 hi2, bool fold2);
-
- // New suffix that matches the byte range lo-hi, then goes to next.
- Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
- re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
-
- // Converts rune range lo-hi into a fragment that recognizes
- // the bytes that would make up those runes in the current
- - // encoding (Latin 1, UTF-8, or UCS-2).
- + // encoding (Latin 1 or UTF-8).
- // This lets the machine work byte-by-byte even when
- // using multibyte encodings.
-
- re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
- case kEncodingLatin1:
- AddRuneRangeLatin1(lo, hi, foldcase);
- break;
- - case kEncodingUCS2:
- - AddRuneRangeUCS2(lo, hi, foldcase);
- - break;
- }
- }
-
- re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
- AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
- }
-
- - // Test whether 16-bit values are big or little endian.
- - static bool BigEndian() {
- - union {
- - char byte[2];
- - int16 endian;
- - } u;
- -
- - u.byte[0] = 1;
- - u.byte[1] = 2;
- - return u.endian == 0x0102;
- - }
- -
- - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
- - uint8 lo2, uint8 hi2, bool fold2) {
- - Inst* ip;
- - if (reversed_) {
- - ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
- - ip = RuneByteSuffix(lo2, hi2, fold2, ip);
- - } else {
- - ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
- - ip = RuneByteSuffix(lo1, hi1, fold1, ip);
- - }
- - AddSuffix(ip);
- - }
- -
- - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
- - if (lo > hi || lo > 0xFFFF)
- - return;
- - if (hi > 0xFFFF)
- - hi = 0xFFFF;
- -
- - // We'll assemble a pattern assuming big endian.
- - // If the machine isn't, tell Cat to reverse its arguments.
- - bool oldreversed = reversed_;
- - if (!BigEndian()) {
- - reversed_ = !oldreversed;
- - }
- -
- - // Split into bytes.
- - int lo1 = lo >> 8;
- - int lo2 = lo & 0xFF;
- - int hi1 = hi >> 8;
- - int hi2 = hi & 0xFF;
- -
- - if (lo1 == hi1) {
- - // Easy case: high bits are same in both.
- - // Only do ASCII case folding on the second byte if the top byte is 00.
- - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
- - } else {
- - // Harder case: different second byte ranges depending on first byte.
- -
- - // Initial fragment.
- - if (lo2 > 0) {
- - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
- - lo1++;
- - }
- -
- - // Trailing fragment.
- - if (hi2 < 0xFF) {
- - AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
- - hi1--;
- - }
- -
- - // Inner ranges.
- - if (lo1 <= hi1) {
- - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
- - }
- - }
- -
- - // Restore reverse setting.
- - reversed_ = oldreversed;
- - }
- -
- // Table describing how to make a UTF-8 matching machine
- // for the rune range 80-10FFFF (Runeself-Runemax).
- // This range happens frequently enough (for example /./ and /[^a-z]/)
- re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
-
- Frag Compiler::Literal(Rune r, bool foldcase) {
- switch (encoding_) {
- - default: // UCS-2 or something new
- - BeginRange();
- - AddRuneRange(r, r, foldcase);
- - return EndRange();
- + default:
- + return kNullFrag;
-
- case kEncodingLatin1:
- return ByteRange(r, r, foldcase);
- re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
-
- if (re->parse_flags() & Regexp::Latin1)
- c.encoding_ = kEncodingLatin1;
- - else if (re->parse_flags() & Regexp::UCS2)
- - c.encoding_ = kEncodingUCS2;
- c.reversed_ = reversed;
- if (max_mem <= 0) {
- c.max_inst_ = 100000; // more than enough
- re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
- c.prog_->set_start_unanchored(c.prog_->start());
- } else {
- Frag dot;
- - if (c.encoding_ == kEncodingUCS2) {
- - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
- - } else {
- - dot = c.ByteRange(0x00, 0xFF, false);
- - }
- + dot = c.ByteRange(0x00, 0xFF, false);
- Frag dotloop = c.Star(dot, true);
- Frag unanchored = c.Cat(dotloop, all);
- c.prog_->set_start_unanchored(unanchored.begin);
- ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
- re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
- const char* bp = context.begin();
- int c = -1;
- int wasword = 0;
- - bool ucs2 = prog_->flags() & Regexp::UCS2;
-
- if (text.begin() > context.begin()) {
- c = text.begin()[-1] & 0xFF;
- re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
- // If there's a required first byte for an unanchored search
- // and we're not in the middle of any possible matches,
- // use memchr to search for the byte quickly.
- - if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
- + if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
- p < text.end() && (p[0] & 0xFF) != first_byte_) {
- p = reinterpret_cast<const char*>(memchr(p, first_byte_,
- text.end() - p));
- re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
- flag = Prog::EmptyFlags(context, p);
- }
-
- - // In UCS-2 mode, if we need to start a new thread,
- - // make sure to do it on an even boundary.
- - if(ucs2 && runq->size() == 0 &&
- - (p - context.begin()) % 2 && p < text.end()) {
- - p++;
- - flag = Prog::EmptyFlags(context, p);
- - }
- -
- // Steal match storage (cleared but unused as of yet)
- // temporarily to hold match boundaries for new thread.
- - // In UCS-2 mode, only start the thread on a 2-byte boundary.
- - if(!ucs2 || (p - context.begin()) % 2 == 0) {
- - match_[0] = p;
- - AddToThreadq(runq, start_, flag, p, match_);
- - match_[0] = NULL;
- - }
- + match_[0] = p;
- + AddToThreadq(runq, start_, flag, p, match_);
- + match_[0] = NULL;
- }
-
- // If all the threads have died, stop early.
- ==== re2/parse.cc#22 - re2/parse.cc#23 ====
- re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
- status_(status), stacktop_(NULL), ncap_(0) {
- if (flags_ & Latin1)
- rune_max_ = 0xFF;
- - else if (flags & UCS2)
- - rune_max_ = 0xFFFF;
- else
- rune_max_ = Runemax;
- }
- re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
- bool Regexp::ParseState::PushCarat() {
- if (flags_ & OneLine) {
- return PushSimpleOp(kRegexpBeginText);
- - } else {
- - if (flags_ & UCS2) {
- - status_->set_code(kRegexpUnsupported);
- - status_->set_error_arg("multiline ^ in UCS-2 mode");
- - return false;
- - }
- - return PushSimpleOp(kRegexpBeginLine);
- }
- + return PushSimpleOp(kRegexpBeginLine);
- }
-
- // Pushes a \b or \B onto the stack.
- bool Regexp::ParseState::PushWordBoundary(bool word) {
- - if (flags_ & UCS2) {
- - status_->set_code(kRegexpUnsupported);
- - status_->set_error_arg("\\b or \\B in UCS-2 mode");
- - return false;
- - }
- if (word)
- return PushSimpleOp(kRegexpWordBoundary);
- return PushSimpleOp(kRegexpNoWordBoundary);
- re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
- bool ret = PushSimpleOp(kRegexpEndText);
- flags_ = oflags;
- return ret;
- - }
- - if (flags_ & UCS2) {
- - status_->set_code(kRegexpUnsupported);
- - status_->set_error_arg("multiline $ in UCS-2 mode");
- - return false;
- }
- return PushSimpleOp(kRegexpEndLine);
- }
- ==== re2/re2.cc#34 - re2/re2.cc#35 ====
- re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
- return RE2::ErrorBadUTF8;
- case re2::kRegexpBadNamedCapture:
- return RE2::ErrorBadNamedCapture;
- - case re2::kRegexpUnsupported:
- - return RE2::ErrorUnsupported;
- }
- return RE2::ErrorInternal;
- }
- re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
- break;
- case RE2::Options::EncodingLatin1:
- flags |= Regexp::Latin1;
- - break;
- - case RE2::Options::EncodingUCS2:
- - flags |= Regexp::UCS2;
- break;
- }
-
- ==== re2/re2.h#36 - re2/re2.h#37 ====
- re2/re2.h#36:246,252 - re2/re2.h#37:246,251
- ErrorBadUTF8, // invalid UTF-8 in regexp
- ErrorBadNamedCapture, // bad named capture group
- ErrorPatternTooLarge, // pattern too large (compile failed)
- - ErrorUnsupported, // unsupported feature (in UCS-2 mode)
- };
-
- // Predefined common options.
- re2/re2.h#36:570,576 - re2/re2.h#37:569,574
-
- enum Encoding {
- EncodingUTF8 = 1,
- - EncodingUCS2, // 16-bit Unicode 0-FFFF only
- EncodingLatin1
- };
-
- ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
- re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
- // the regexp that remains after the prefix. The prefix might
- // be ASCII case-insensitive.
- bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
- - // Don't even bother for UCS-2; it's time to throw that code away.
- - if (parse_flags_ & UCS2)
- - return false;
- -
- // No need for a walker: the regexp must be of the form
- // 1. some number of ^ anchors
- // 2. a literal char or string
- ==== re2/regexp.h#20 - re2/regexp.h#21 ====
- re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
- kRegexpBadPerlOp, // bad perl operator
- kRegexpBadUTF8, // invalid UTF-8 in regexp
- kRegexpBadNamedCapture, // bad named capture
- - kRegexpUnsupported, // unsupported operator
- };
-
- // Error status for certain operations.
- re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
- // \Q and \E to disable/enable metacharacters
- // (?P<name>expr) for named captures
- // \C to match any single byte
- - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
- - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
- + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
- // and \P{Han} for its negation.
- - NeverNL = 1<<12, // Never match NL, even if the regexp mentions
- + NeverNL = 1<<11, // Never match NL, even if the regexp mentions
- // it explicitly.
-
- // As close to Perl as we can get.
- ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
- re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
- cap_[0] = p;
- if (Visit(prog_->start(), p)) // Match must be leftmost; done.
- return true;
- - if (prog_->flags() & Regexp::UCS2)
- - p++;
- }
- return false;
- }
- ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
- re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
- static ParseMode parse_modes[] = {
- { single_line, "single-line" },
- { single_line|Regexp::Latin1, "single-line, latin1" },
- - { single_line|Regexp::UCS2, "single-line, ucs2" },
- { multi_line, "multiline" },
- { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
- { multi_line|Regexp::Latin1, "multiline, latin1" },
- - { multi_line|Regexp::UCS2, "multiline, ucs2" },
- };
-
- static string FormatMode(Regexp::ParseFlags flags) {
- re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
- RegexpStatus status;
- regexp_ = Regexp::Parse(regexp_str, flags, &status);
- if (regexp_ == NULL) {
- - if (status.code() != kRegexpUnsupported) {
- - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
- - << " mode: " << FormatMode(flags);
- - error_ = true;
- - }
- + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
- + << " mode: " << FormatMode(flags);
- + error_ = true;
- return;
- }
- prog_ = regexp_->CompileToProg(0);
- re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
- RE2::Options options;
- if (flags & Regexp::Latin1)
- options.set_encoding(RE2::Options::EncodingLatin1);
- - else if (flags & Regexp::UCS2)
- - options.set_encoding(RE2::Options::EncodingUCS2);
- if (kind_ == Prog::kLongestMatch)
- options.set_longest_match(true);
- re2_ = new RE2(re, options);
- re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
- delete re2_;
- }
-
- - // Converts UTF-8 string in text into UCS-2 string in new_text.
- - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
- - const char* p = text.begin();
- - const char* ep = text.end();
- - uint16* q = new uint16[ep - p];
- - uint16* q0 = q;
- -
- - int n;
- - Rune r;
- - for (; p < ep; p += n) {
- - if (!fullrune(p, ep - p)) {
- - delete[] q0;
- - return false;
- - }
- - n = chartorune(&r, p);
- - if (r > 0xFFFF) {
- - delete[] q0;
- - return false;
- - }
- - *q++ = r;
- - }
- - *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
- - return true;
- - }
- -
- - // Rewrites *sp from being a pointer into text8 (UTF-8)
- - // to being a pointer into text16 (equivalent text but in UCS-2).
- - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
- - StringPiece *sp) {
- - if (sp->begin() == NULL && text8.begin() != NULL)
- - return;
- -
- - int nrune = 0;
- - int n;
- - Rune r;
- - const char* p = text8.begin();
- - const char* ep = text8.end();
- - const char* spbegin = NULL;
- - const char* spend = NULL;
- - for (;;) {
- - if (p == sp->begin())
- - spbegin = text16.begin() + sizeof(uint16)*nrune;
- - if (p == sp->end())
- - spend = text16.begin() + sizeof(uint16)*nrune;
- - if (p >= ep)
- - break;
- - n = chartorune(&r, p);
- - p += n;
- - nrune++;
- - }
- - if (spbegin == NULL || spend == NULL) {
- - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
- - << CEscape(text8) << " "
- - << (int)(sp->begin() - text8.begin()) << " "
- - << (int)(sp->end() - text8.begin());
- - }
- - *sp = StringPiece(spbegin, spend - spbegin);
- - }
- -
- - // Rewrites *sp from begin a pointer into text16 (UCS-2)
- - // to being a pointer into text8 (equivalent text but in UTF-8).
- - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
- - StringPiece* sp) {
- - if (sp->begin() == NULL)
- - return;
- -
- - int nrune = 0;
- - int n;
- - Rune r;
- - const char* p = text8.begin();
- - const char* ep = text8.end();
- - const char* spbegin = NULL;
- - const char* spend = NULL;
- - for (;;) {
- - if (nrune == (sp->begin() - text16.begin())/2)
- - spbegin = p;
- - if (nrune == (sp->end() - text16.begin())/2)
- - spend = p;
- - if (p >= ep)
- - break;
- - n = chartorune(&r, p);
- - p += n;
- - nrune++;
- - }
- - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
- - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
- - << CEscape(text16) << " "
- - << (int)(sp->begin() - text16.begin()) << " "
- - << (int)(sp->end() - text16.begin());
- - }
- - *sp = StringPiece(spbegin, spend - spbegin);
- - }
- -
- // Runs a single search using the named engine type.
- // This interface hides all the irregularities of the various
- // engine interfaces from the rest of this file.
- re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
-
- StringPiece text = orig_text;
- StringPiece context = orig_context;
- - bool ucs2 = false;
-
- - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
- - if (!ConvertUTF8ToUCS2(orig_context, &context)) {
- - result->skipped = true;
- - return;
- - }
- -
- - // Rewrite context to refer to new text.
- - AdjustUTF8ToUCS2(orig_context, context, &text);
- - ucs2 = true;
- - }
- -
- switch (type) {
- default:
- LOG(FATAL) << "Bad RunSearch type: " << (int)type;
- re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
- }
- }
-
- - // If we did UCS-2 matching, rewrite the matches to refer
- - // to the original UTF-8 text.
- - if (ucs2) {
- - if (result->matched) {
- - if (result->have_submatch0) {
- - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
- - } else if (result->have_submatch) {
- - for (int i = 0; i < nsubmatch; i++) {
- - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
- - }
- - }
- - }
- - delete[] context.begin();
- - }
- -
- if (!result->matched)
- memset(result->submatch, 0, sizeof result->submatch);
- }
- re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
- return true;
- }
-
- - // Check whether text uses only Unicode points <= 0xFFFF
- - // (in the BMP).
- - static bool IsBMP(const StringPiece& text) {
- - const char* p = text.begin();
- - const char* ep = text.end();
- - while (p < ep) {
- - if (!fullrune(p, ep - p))
- - return false;
- - Rune r;
- - p += chartorune(&r, p);
- - if (r > 0xFFFF)
- - return false;
- - }
- - return true;
- - }
- -
- // Runs a single test.
- bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor) {
- re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
- Result correct;
- RunSearch(kEngineBacktrack, text, context, anchor, &correct);
- if (correct.skipped) {
- - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
- + if (regexp_ == NULL)
- return true;
- LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
- << " " << FormatMode(flags_);
|