ucs2.diff 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. This is a dump from Google's source control system of the change
  2. that removed UCS-2 support from RE2. As the explanation below
  3. says, UCS-2 mode is fundamentally at odds with things like ^ and $,
  4. so it never really worked very well. However, for patterns that do not use
  5. those operators, it did work. It assumed that the
  6. UCS-2 data was in the native host byte order.
  7. If you are interested in adding UCS-2 mode back, this patch might
  8. be a good starting point.
  9. Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
  10. Retire UCS-2 mode.
  11. I added it as an experiment for V8, but it
  12. requires 2-byte lookahead to do completely,
  13. and RE2 has 1-byte lookahead (enough for UTF-8)
  14. as a fairly deep fundamental assumption,
  15. so it did not support ^ or $.
  16. ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
  17. re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
  18. cap_[0] = p;
  19. if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
  20. return true;
  21. - if (prog_->flags() & Regexp::UCS2)
  22. - p++;
  23. }
  24. return false;
  25. }
  26. ==== re2/compile.cc#17 - re2/compile.cc#18 ====
  27. re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
  28. // Input encodings.
  29. enum Encoding {
  30. kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
  31. - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
  32. kEncodingLatin1, // Latin1 (0-FF)
  33. };
  34. re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
  35. void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  36. void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  37. void Add_80_10ffff();
  38. - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
  39. - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
  40. - uint8 lo2, uint8 hi2, bool fold2);
  41. // New suffix that matches the byte range lo-hi, then goes to next.
  42. Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
  43. re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
  44. // Converts rune range lo-hi into a fragment that recognizes
  45. // the bytes that would make up those runes in the current
  46. - // encoding (Latin 1, UTF-8, or UCS-2).
  47. + // encoding (Latin 1 or UTF-8).
  48. // This lets the machine work byte-by-byte even when
  49. // using multibyte encodings.
  50. re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
  51. case kEncodingLatin1:
  52. AddRuneRangeLatin1(lo, hi, foldcase);
  53. break;
  54. - case kEncodingUCS2:
  55. - AddRuneRangeUCS2(lo, hi, foldcase);
  56. - break;
  57. }
  58. }
  59. re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
  60. AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
  61. }
  62. - // Test whether 16-bit values are big or little endian.
  63. - static bool BigEndian() {
  64. - union {
  65. - char byte[2];
  66. - int16 endian;
  67. - } u;
  68. -
  69. - u.byte[0] = 1;
  70. - u.byte[1] = 2;
  71. - return u.endian == 0x0102;
  72. - }
  73. -
  74. - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
  75. - uint8 lo2, uint8 hi2, bool fold2) {
  76. - Inst* ip;
  77. - if (reversed_) {
  78. - ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
  79. - ip = RuneByteSuffix(lo2, hi2, fold2, ip);
  80. - } else {
  81. - ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
  82. - ip = RuneByteSuffix(lo1, hi1, fold1, ip);
  83. - }
  84. - AddSuffix(ip);
  85. - }
  86. -
  87. - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
  88. - if (lo > hi || lo > 0xFFFF)
  89. - return;
  90. - if (hi > 0xFFFF)
  91. - hi = 0xFFFF;
  92. -
  93. - // We'll assemble a pattern assuming big endian.
  94. - // If the machine isn't, tell Cat to reverse its arguments.
  95. - bool oldreversed = reversed_;
  96. - if (!BigEndian()) {
  97. - reversed_ = !oldreversed;
  98. - }
  99. -
  100. - // Split into bytes.
  101. - int lo1 = lo >> 8;
  102. - int lo2 = lo & 0xFF;
  103. - int hi1 = hi >> 8;
  104. - int hi2 = hi & 0xFF;
  105. -
  106. - if (lo1 == hi1) {
  107. - // Easy case: high bits are same in both.
  108. - // Only do ASCII case folding on the second byte if the top byte is 00.
  109. - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
  110. - } else {
  111. - // Harder case: different second byte ranges depending on first byte.
  112. -
  113. - // Initial fragment.
  114. - if (lo2 > 0) {
  115. - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
  116. - lo1++;
  117. - }
  118. -
  119. - // Trailing fragment.
  120. - if (hi2 < 0xFF) {
  121. - AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
  122. - hi1--;
  123. - }
  124. -
  125. - // Inner ranges.
  126. - if (lo1 <= hi1) {
  127. - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
  128. - }
  129. - }
  130. -
  131. - // Restore reverse setting.
  132. - reversed_ = oldreversed;
  133. - }
  134. -
  135. // Table describing how to make a UTF-8 matching machine
  136. // for the rune range 80-10FFFF (Runeself-Runemax).
  137. // This range happens frequently enough (for example /./ and /[^a-z]/)
  138. re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
  139. Frag Compiler::Literal(Rune r, bool foldcase) {
  140. switch (encoding_) {
  141. - default: // UCS-2 or something new
  142. - BeginRange();
  143. - AddRuneRange(r, r, foldcase);
  144. - return EndRange();
  145. + default:
  146. + return kNullFrag;
  147. case kEncodingLatin1:
  148. return ByteRange(r, r, foldcase);
  149. re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
  150. if (re->parse_flags() & Regexp::Latin1)
  151. c.encoding_ = kEncodingLatin1;
  152. - else if (re->parse_flags() & Regexp::UCS2)
  153. - c.encoding_ = kEncodingUCS2;
  154. c.reversed_ = reversed;
  155. if (max_mem <= 0) {
  156. c.max_inst_ = 100000; // more than enough
  157. re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
  158. c.prog_->set_start_unanchored(c.prog_->start());
  159. } else {
  160. Frag dot;
  161. - if (c.encoding_ == kEncodingUCS2) {
  162. - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
  163. - } else {
  164. - dot = c.ByteRange(0x00, 0xFF, false);
  165. - }
  166. + dot = c.ByteRange(0x00, 0xFF, false);
  167. Frag dotloop = c.Star(dot, true);
  168. Frag unanchored = c.Cat(dotloop, all);
  169. c.prog_->set_start_unanchored(unanchored.begin);
  170. ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
  171. re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
  172. const char* bp = context.begin();
  173. int c = -1;
  174. int wasword = 0;
  175. - bool ucs2 = prog_->flags() & Regexp::UCS2;
  176. if (text.begin() > context.begin()) {
  177. c = text.begin()[-1] & 0xFF;
  178. re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
  179. // If there's a required first byte for an unanchored search
  180. // and we're not in the middle of any possible matches,
  181. // use memchr to search for the byte quickly.
  182. - if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
  183. + if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
  184. p < text.end() && (p[0] & 0xFF) != first_byte_) {
  185. p = reinterpret_cast<const char*>(memchr(p, first_byte_,
  186. text.end() - p));
  187. re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
  188. flag = Prog::EmptyFlags(context, p);
  189. }
  190. - // In UCS-2 mode, if we need to start a new thread,
  191. - // make sure to do it on an even boundary.
  192. - if(ucs2 && runq->size() == 0 &&
  193. - (p - context.begin()) % 2 && p < text.end()) {
  194. - p++;
  195. - flag = Prog::EmptyFlags(context, p);
  196. - }
  197. -
  198. // Steal match storage (cleared but unused as of yet)
  199. // temporarily to hold match boundaries for new thread.
  200. - // In UCS-2 mode, only start the thread on a 2-byte boundary.
  201. - if(!ucs2 || (p - context.begin()) % 2 == 0) {
  202. - match_[0] = p;
  203. - AddToThreadq(runq, start_, flag, p, match_);
  204. - match_[0] = NULL;
  205. - }
  206. + match_[0] = p;
  207. + AddToThreadq(runq, start_, flag, p, match_);
  208. + match_[0] = NULL;
  209. }
  210. // If all the threads have died, stop early.
  211. ==== re2/parse.cc#22 - re2/parse.cc#23 ====
  212. re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
  213. status_(status), stacktop_(NULL), ncap_(0) {
  214. if (flags_ & Latin1)
  215. rune_max_ = 0xFF;
  216. - else if (flags & UCS2)
  217. - rune_max_ = 0xFFFF;
  218. else
  219. rune_max_ = Runemax;
  220. }
  221. re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
  222. bool Regexp::ParseState::PushCarat() {
  223. if (flags_ & OneLine) {
  224. return PushSimpleOp(kRegexpBeginText);
  225. - } else {
  226. - if (flags_ & UCS2) {
  227. - status_->set_code(kRegexpUnsupported);
  228. - status_->set_error_arg("multiline ^ in UCS-2 mode");
  229. - return false;
  230. - }
  231. - return PushSimpleOp(kRegexpBeginLine);
  232. }
  233. + return PushSimpleOp(kRegexpBeginLine);
  234. }
  235. // Pushes a \b or \B onto the stack.
  236. bool Regexp::ParseState::PushWordBoundary(bool word) {
  237. - if (flags_ & UCS2) {
  238. - status_->set_code(kRegexpUnsupported);
  239. - status_->set_error_arg("\\b or \\B in UCS-2 mode");
  240. - return false;
  241. - }
  242. if (word)
  243. return PushSimpleOp(kRegexpWordBoundary);
  244. return PushSimpleOp(kRegexpNoWordBoundary);
  245. re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
  246. bool ret = PushSimpleOp(kRegexpEndText);
  247. flags_ = oflags;
  248. return ret;
  249. - }
  250. - if (flags_ & UCS2) {
  251. - status_->set_code(kRegexpUnsupported);
  252. - status_->set_error_arg("multiline $ in UCS-2 mode");
  253. - return false;
  254. }
  255. return PushSimpleOp(kRegexpEndLine);
  256. }
  257. ==== re2/re2.cc#34 - re2/re2.cc#35 ====
  258. re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
  259. return RE2::ErrorBadUTF8;
  260. case re2::kRegexpBadNamedCapture:
  261. return RE2::ErrorBadNamedCapture;
  262. - case re2::kRegexpUnsupported:
  263. - return RE2::ErrorUnsupported;
  264. }
  265. return RE2::ErrorInternal;
  266. }
  267. re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
  268. break;
  269. case RE2::Options::EncodingLatin1:
  270. flags |= Regexp::Latin1;
  271. - break;
  272. - case RE2::Options::EncodingUCS2:
  273. - flags |= Regexp::UCS2;
  274. break;
  275. }
  276. ==== re2/re2.h#36 - re2/re2.h#37 ====
  277. re2/re2.h#36:246,252 - re2/re2.h#37:246,251
  278. ErrorBadUTF8, // invalid UTF-8 in regexp
  279. ErrorBadNamedCapture, // bad named capture group
  280. ErrorPatternTooLarge, // pattern too large (compile failed)
  281. - ErrorUnsupported, // unsupported feature (in UCS-2 mode)
  282. };
  283. // Predefined common options.
  284. re2/re2.h#36:570,576 - re2/re2.h#37:569,574
  285. enum Encoding {
  286. EncodingUTF8 = 1,
  287. - EncodingUCS2, // 16-bit Unicode 0-FFFF only
  288. EncodingLatin1
  289. };
  290. ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
  291. re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
  292. // the regexp that remains after the prefix. The prefix might
  293. // be ASCII case-insensitive.
  294. bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
  295. - // Don't even bother for UCS-2; it's time to throw that code away.
  296. - if (parse_flags_ & UCS2)
  297. - return false;
  298. -
  299. // No need for a walker: the regexp must be of the form
  300. // 1. some number of ^ anchors
  301. // 2. a literal char or string
  302. ==== re2/regexp.h#20 - re2/regexp.h#21 ====
  303. re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
  304. kRegexpBadPerlOp, // bad perl operator
  305. kRegexpBadUTF8, // invalid UTF-8 in regexp
  306. kRegexpBadNamedCapture, // bad named capture
  307. - kRegexpUnsupported, // unsupported operator
  308. };
  309. // Error status for certain operations.
  310. re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
  311. // \Q and \E to disable/enable metacharacters
  312. // (?P<name>expr) for named captures
  313. // \C to match any single byte
  314. - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
  315. - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
  316. + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
  317. // and \P{Han} for its negation.
  318. - NeverNL = 1<<12, // Never match NL, even if the regexp mentions
  319. + NeverNL = 1<<11, // Never match NL, even if the regexp mentions
  320. // it explicitly.
  321. // As close to Perl as we can get.
  322. ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
  323. re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
  324. cap_[0] = p;
  325. if (Visit(prog_->start(), p)) // Match must be leftmost; done.
  326. return true;
  327. - if (prog_->flags() & Regexp::UCS2)
  328. - p++;
  329. }
  330. return false;
  331. }
  332. ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
  333. re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
  334. static ParseMode parse_modes[] = {
  335. { single_line, "single-line" },
  336. { single_line|Regexp::Latin1, "single-line, latin1" },
  337. - { single_line|Regexp::UCS2, "single-line, ucs2" },
  338. { multi_line, "multiline" },
  339. { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
  340. { multi_line|Regexp::Latin1, "multiline, latin1" },
  341. - { multi_line|Regexp::UCS2, "multiline, ucs2" },
  342. };
  343. static string FormatMode(Regexp::ParseFlags flags) {
  344. re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
  345. RegexpStatus status;
  346. regexp_ = Regexp::Parse(regexp_str, flags, &status);
  347. if (regexp_ == NULL) {
  348. - if (status.code() != kRegexpUnsupported) {
  349. - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
  350. - << " mode: " << FormatMode(flags);
  351. - error_ = true;
  352. - }
  353. + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
  354. + << " mode: " << FormatMode(flags);
  355. + error_ = true;
  356. return;
  357. }
  358. prog_ = regexp_->CompileToProg(0);
  359. re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
  360. RE2::Options options;
  361. if (flags & Regexp::Latin1)
  362. options.set_encoding(RE2::Options::EncodingLatin1);
  363. - else if (flags & Regexp::UCS2)
  364. - options.set_encoding(RE2::Options::EncodingUCS2);
  365. if (kind_ == Prog::kLongestMatch)
  366. options.set_longest_match(true);
  367. re2_ = new RE2(re, options);
  368. re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
  369. delete re2_;
  370. }
  371. - // Converts UTF-8 string in text into UCS-2 string in new_text.
  372. - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
  373. - const char* p = text.begin();
  374. - const char* ep = text.end();
  375. - uint16* q = new uint16[ep - p];
  376. - uint16* q0 = q;
  377. -
  378. - int n;
  379. - Rune r;
  380. - for (; p < ep; p += n) {
  381. - if (!fullrune(p, ep - p)) {
  382. - delete[] q0;
  383. - return false;
  384. - }
  385. - n = chartorune(&r, p);
  386. - if (r > 0xFFFF) {
  387. - delete[] q0;
  388. - return false;
  389. - }
  390. - *q++ = r;
  391. - }
  392. - *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
  393. - return true;
  394. - }
  395. -
  396. - // Rewrites *sp from being a pointer into text8 (UTF-8)
  397. - // to being a pointer into text16 (equivalent text but in UCS-2).
  398. - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
  399. - StringPiece *sp) {
  400. - if (sp->begin() == NULL && text8.begin() != NULL)
  401. - return;
  402. -
  403. - int nrune = 0;
  404. - int n;
  405. - Rune r;
  406. - const char* p = text8.begin();
  407. - const char* ep = text8.end();
  408. - const char* spbegin = NULL;
  409. - const char* spend = NULL;
  410. - for (;;) {
  411. - if (p == sp->begin())
  412. - spbegin = text16.begin() + sizeof(uint16)*nrune;
  413. - if (p == sp->end())
  414. - spend = text16.begin() + sizeof(uint16)*nrune;
  415. - if (p >= ep)
  416. - break;
  417. - n = chartorune(&r, p);
  418. - p += n;
  419. - nrune++;
  420. - }
  421. - if (spbegin == NULL || spend == NULL) {
  422. - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
  423. - << CEscape(text8) << " "
  424. - << (int)(sp->begin() - text8.begin()) << " "
  425. - << (int)(sp->end() - text8.begin());
  426. - }
  427. - *sp = StringPiece(spbegin, spend - spbegin);
  428. - }
  429. -
  430. - // Rewrites *sp from being a pointer into text16 (UCS-2)
  431. - // to being a pointer into text8 (equivalent text but in UTF-8).
  432. - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
  433. - StringPiece* sp) {
  434. - if (sp->begin() == NULL)
  435. - return;
  436. -
  437. - int nrune = 0;
  438. - int n;
  439. - Rune r;
  440. - const char* p = text8.begin();
  441. - const char* ep = text8.end();
  442. - const char* spbegin = NULL;
  443. - const char* spend = NULL;
  444. - for (;;) {
  445. - if (nrune == (sp->begin() - text16.begin())/2)
  446. - spbegin = p;
  447. - if (nrune == (sp->end() - text16.begin())/2)
  448. - spend = p;
  449. - if (p >= ep)
  450. - break;
  451. - n = chartorune(&r, p);
  452. - p += n;
  453. - nrune++;
  454. - }
  455. - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
  456. - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
  457. - << CEscape(text16) << " "
  458. - << (int)(sp->begin() - text16.begin()) << " "
  459. - << (int)(sp->end() - text16.begin());
  460. - }
  461. - *sp = StringPiece(spbegin, spend - spbegin);
  462. - }
  463. -
  464. // Runs a single search using the named engine type.
  465. // This interface hides all the irregularities of the various
  466. // engine interfaces from the rest of this file.
  467. re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
  468. StringPiece text = orig_text;
  469. StringPiece context = orig_context;
  470. - bool ucs2 = false;
  471. - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
  472. - if (!ConvertUTF8ToUCS2(orig_context, &context)) {
  473. - result->skipped = true;
  474. - return;
  475. - }
  476. -
  477. - // Rewrite context to refer to new text.
  478. - AdjustUTF8ToUCS2(orig_context, context, &text);
  479. - ucs2 = true;
  480. - }
  481. -
  482. switch (type) {
  483. default:
  484. LOG(FATAL) << "Bad RunSearch type: " << (int)type;
  485. re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
  486. }
  487. }
  488. - // If we did UCS-2 matching, rewrite the matches to refer
  489. - // to the original UTF-8 text.
  490. - if (ucs2) {
  491. - if (result->matched) {
  492. - if (result->have_submatch0) {
  493. - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
  494. - } else if (result->have_submatch) {
  495. - for (int i = 0; i < nsubmatch; i++) {
  496. - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
  497. - }
  498. - }
  499. - }
  500. - delete[] context.begin();
  501. - }
  502. -
  503. if (!result->matched)
  504. memset(result->submatch, 0, sizeof result->submatch);
  505. }
  506. re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
  507. return true;
  508. }
  509. - // Check whether text uses only Unicode points <= 0xFFFF
  510. - // (in the BMP).
  511. - static bool IsBMP(const StringPiece& text) {
  512. - const char* p = text.begin();
  513. - const char* ep = text.end();
  514. - while (p < ep) {
  515. - if (!fullrune(p, ep - p))
  516. - return false;
  517. - Rune r;
  518. - p += chartorune(&r, p);
  519. - if (r > 0xFFFF)
  520. - return false;
  521. - }
  522. - return true;
  523. - }
  524. -
  525. // Runs a single test.
  526. bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
  527. Prog::Anchor anchor) {
  528. re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
  529. Result correct;
  530. RunSearch(kEngineBacktrack, text, context, anchor, &correct);
  531. if (correct.skipped) {
  532. - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
  533. + if (regexp_ == NULL)
  534. return true;
  535. LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
  536. << " " << FormatMode(flags_);