00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef EXTENDED_REGULAR_EXPRESSIONEXTENDED_REG_EXP_H
00021 #define EXTENDED_REGULAR_EXPRESSIONEXTENDED_REG_EXP_H
00022
00023 namespace extended_regular_expression {
00024
00043 template<typename E>
00044 class extended_reg_exp : public ere_base<E>
00045 {
00046 typedef typename grammar_to_parser::basic_parser<E>::parser_list
00047 parser_list;
00048
00049 class ere_branch_or_extended_reg_exp : public ere_base<E>
00050 {
00051 grammar_to_parser::basic_choice<E, ere_branch<E> > m_branch;
00052 grammar_to_parser::basic_terminal<E,'|'> m_or;
00053 grammar_to_parser::basic_choice<E, extended_reg_exp<E> >
00054 m_ext_regexp;
00055 public:
00056 ere_branch_or_extended_reg_exp() : ere_base<E>() {};
00057 ~ere_branch_or_extended_reg_exp() {};
00058
00059 unsigned long recognize( const E* buf,
00060 const unsigned long buf_length,
00061 const unsigned long buf_offset,
00062 bool try_positions = true );
00063 void assign_matches( matches::match_key parent_address,
00064 unsigned long& branch_pos,
00065 matches& m );
00066 void push_parsers( parser_list& l );
00067 };
00068
00069 grammar_to_parser::basic_choice<E, ere_branch<E> > m_branch;
00070 grammar_to_parser::basic_non_terminal<E, ere_branch_or_extended_reg_exp >
00071 m_branch_or_ext_regexp;
00072
00073 public:
00074
00075 extended_reg_exp() :
00076 ere_base<E>(),
00077 m_branch_or_ext_regexp() {};
00078 ~extended_reg_exp() {};
00079
00080 unsigned long recognize( const E* buf,
00081 const unsigned long buf_length,
00082 const unsigned long buf_offset,
00083 bool try_positions = true );
00084 void assign_matches( matches::match_key parent_address,
00085 unsigned long& branch_pos,
00086 matches& m );
00087
00088 void push_parsers( parser_list& l );
00089 };
00090
00091
00092
00093
00094 template< typename E >
00095 unsigned long ere_expression<E>::subexpression::recognize(
00096 const E* buf,
00097 const unsigned long buf_length,
00098 const unsigned long buf_offset,
00099 bool try_positions )
00100 {
00101 m_extened_regexp->recognize( buf, buf_length, buf_offset );
00102 m_is_rec = m_extened_regexp->is_recognized();
00103 if( m_is_rec )
00104 {
00105 m_rec_size = m_extened_regexp->recognize( buf, buf_length, buf_offset );
00106 m_rec_pos = m_extened_regexp->recognized_position();
00107 }
00108 return m_rec_size;
00109 }
00110
00111 template< typename E >
00112 void ere_expression<E>::subexpression::assign_matches(
00113 matches::match_key key,
00114 unsigned long& branch_pos,
00115 matches& m )
00116 {
00117 key += '.';
00118 key += to_str(branch_pos);
00119 unsigned long new_branch_pos = 0;
00120 m_extened_regexp->assign_matches( key, branch_pos, m );
00121 }
00122
00123
00124 template< typename E >
00125 void ere_expression<E>::subexpression::push_parsers( parser_list& l )
00126 {
00127 l.push_back( &m_left_paren );
00128 l.push_back( &m_extened_regexp );
00129 l.push_back( &m_right_paren );
00130 }
00131
00132 template< typename E >
00133 unsigned long ere_expression<E>::ere_expression_and_dupl::divide_and_recognize(
00134 const E* buf,
00135 const unsigned long buf_length,
00136 const unsigned long buf_offset,
00137 unsigned long rep,
00138 bool assign_matches )
00139 {
00140 m_is_rec = true;
00141 if( rep == m_dup->get_max() )
00142 {
00143 m_dup->set_recognized_dup(rep);
00144 return 0;
00145 }
00146 unsigned long best_left_side_buf_len = 0;
00147 unsigned long best_rec_size = 0;
00148 unsigned long left_side_buf_len = buf_length;
00149 unsigned long left_rec_size = 0;
00150 while( m_is_rec )
00151 {
00152 m_ere->recognize( buf, left_side_buf_len, buf_offset, true );
00153 if( m_ere->is_recognized() )
00154 {
00155 left_rec_size = m_ere->recognized_size();
00156
00157 m_rec_size = divide_and_recognize(
00158 buf,
00159 buf_length,
00160 buf_offset + left_rec_size,
00161 rep+1,
00162 false );
00163 if( m_is_rec )
00164 {
00165
00166 if( best_rec_size < m_rec_size + left_rec_size )
00167 {
00168 best_rec_size = m_rec_size + left_rec_size;
00169 best_left_side_buf_len = left_side_buf_len;
00170 }
00171 }
00172 left_side_buf_len = buf_offset + left_rec_size - 1;
00173 m_is_rec = true;
00174 }
00175 else
00176 {
00177 m_is_rec = false;
00178 }
00179 }
00180
00181 if( best_rec_size > 0 )
00182 {
00183 if( assign_matches )
00184 {
00185 m_ere->recognize( buf, best_left_side_buf_len, buf_offset, true );
00186 unsigned long branch_pos = rep;
00187 m_ere->assign_matches( "", branch_pos, m_matches );
00188
00189 m_rec_size = 0;
00190 left_rec_size = m_ere->recognized_size();
00191 divide_and_recognize(
00192 buf,
00193 buf_length,
00194 buf_offset + left_rec_size,
00195 rep+1,
00196 true );
00197 m_rec_size += left_rec_size;
00198 }
00199 else
00200 {
00201 m_is_rec = true;
00202 m_rec_size = best_rec_size;
00203 }
00204 }
00205 else if( rep >= m_dup->get_min() )
00206 {
00207 m_rec_size = 0;
00208 m_is_rec = true;
00209 }
00210 else
00211 {
00212 m_is_rec = false;
00213 }
00214 return m_rec_size;
00215 }
00216
00217
00218 template< typename E >
00219 unsigned long ere_expression<E>::ere_expression_and_dupl::recognize(
00220 const E* buf,
00221 const unsigned long buf_length,
00222 const unsigned long buf_offset,
00223 bool try_positions )
00224 {
00225 unsigned long rep = 1;
00226
00227 if( m_ere->is_subexpression() )
00228 {
00229 m_matches.clear();
00230
00231
00232
00233
00234 divide_and_recognize( buf,
00235 buf_length,
00236 buf_offset,
00237 0,
00238 true );
00239 m_rec_pos = buf_offset;
00240 return m_rec_size;
00241 }
00242
00243
00244 m_ere->recognize( buf, buf_length, buf_offset, try_positions );
00245 m_is_rec = m_ere->is_recognized();
00246 m_rec_pos = m_ere->recognized_position();
00247 m_rec_size = m_ere->recognized_size();
00248
00249 if( m_ere->is_recognized() && m_ere->is_subexpression() )
00250 {
00251 unsigned long branch_pos = 0;
00252 m_ere->assign_matches( "", branch_pos, m_matches );
00253 }
00254
00255 while( m_is_rec && rep < m_dup->get_max() )
00256 {
00257 m_ere->recognize( buf,
00258 buf_length,
00259 m_rec_pos+m_rec_size,
00260 false );
00261 if( m_ere->is_recognized() )
00262 {
00263 m_rec_size += m_ere->recognized_size();
00264 rep++;
00265 }
00266 else if( rep < m_dup->get_min() )
00267 {
00268 m_is_rec = false;
00269 m_rec_pos = 0;
00270 m_rec_size = 0;
00271 m_matches.clear();
00272 }
00273 else
00274 {
00275 break;
00276 }
00277 }
00278 m_dup->set_recognized_dup( rep );
00279 return m_rec_size;
00280 }
00281
00282 template< typename E >
00283 void ere_expression<E>::ere_expression_and_dupl::assign_matches(
00284 matches::match_key key,
00285 unsigned long& branch_pos,
00286 matches& m )
00287 {
00288 if( m_ere->is_subexpression() )
00289 {
00290
00291
00292 m.extend( key, m_matches );
00293 }
00294 else
00295 {
00296 for( int i = 0; i < m_dup->get_recognized_dup(); i++ )
00297 {
00298 m_ere->assign_matches( key, branch_pos, m );
00299 }
00300 }
00301 }
00302
00303 template< typename E >
00304 void ere_expression<E>::ere_expression_and_dupl::push_parsers( parser_list& l )
00305 {
00306 l.push_back( &m_ere );
00307 l.push_back( &m_dup );
00308 }
00309
00310 template<typename E>
00311 unsigned long ere_expression<E>::recognize(const E* buf,
00312 const unsigned long buf_length,
00313 const unsigned long buf_offset,
00314 bool try_positions)
00315 {
00316 m_is_rec = false;
00317 m_rec_pos = buf_offset;
00318 m_rec_size = 0;
00319
00320 if( m_one_elem.is_parsed() )
00321 {
00322 while( !m_is_rec && m_rec_pos < buf_length )
00323 {
00324 m_rec_size = m_one_elem->recognize( buf, buf_length, m_rec_pos );
00325 m_is_rec = m_rec_size > 0;
00326 if( m_is_rec )
00327 {
00328
00329 }
00330 else if( try_positions )
00331 {
00332 m_rec_pos++;
00333 }
00334 else
00335 {
00336 break;
00337 }
00338 }
00339 }
00340 else if( m_left_anchor.is_parsed() || m_right_anchor.is_parsed() )
00341 {
00342 m_is_rec = true;
00343 }
00344 else if( m_ere_expr_and_dup.is_parsed() )
00345 {
00346 m_ere_expr_and_dup->recognize( buf, buf_length, buf_offset, try_positions );
00347 m_is_rec = m_ere_expr_and_dup->is_recognized() ;
00348 if( m_is_rec )
00349 {
00350 m_rec_pos = m_ere_expr_and_dup->recognized_position();
00351 m_rec_size = m_ere_expr_and_dup->recognized_size();
00352 }
00353 }
00354 else
00355 {
00356 m_subexpression->recognize( buf, buf_length, buf_offset );
00357 m_is_rec = m_subexpression->is_recognized();
00358 m_rec_size = m_subexpression->recognized_size() +
00359 m_subexpression->recognized_position() -
00360 buf_offset;
00361 }
00362 return m_rec_size;
00363 }
00364
00365 template<typename E>
00366 void ere_expression<E>::assign_matches( matches::match_key key,
00367 unsigned long& branch_pos,
00368 matches& m )
00369 {
00370 if( m_subexpression.is_parsed() )
00371 {
00372 branch_pos++;
00373 m_subexpression->assign_matches( key, branch_pos, m );
00374 }
00375 else if( m_ere_expr_and_dup.is_parsed() )
00376 {
00377 m_ere_expr_and_dup->assign_matches( key, branch_pos, m );
00378 }
00379 }
00380
00381 template<typename E>
00382 void ere_expression<E>::push_parsers( parser_list& l )
00383 {
00384 l.push_back( &m_ere_expr_and_dup );
00385 l.push_back( &m_one_elem );
00386 l.push_back( &m_left_anchor );
00387 l.push_back( &m_right_anchor );
00388 l.push_back( &m_subexpression );
00389 }
00390
00391 template<typename E>
00392 unsigned long ere_branch<E>::ere_expression_ere_branch::recognize(
00393 const E* buf,
00394 const unsigned long buf_length,
00395 const unsigned long buf_offset,
00396 bool try_positions )
00397 {
00398 m_is_rec = false;
00399 m_rec_pos = 0;
00400 m_rec_size = 0;
00401 if( m_ere_branch->is_right_anchor() )
00402 {
00403
00404 m_ere_expression->recognize( buf, buf_length, buf_offset );
00405 if( m_ere_expression->is_recognized() )
00406 {
00407 m_rec_size = m_ere_expression->recognized_size();
00408 m_rec_pos = m_ere_expression->recognized_position();
00409 m_is_rec = ((m_rec_pos + m_rec_size) == buf_length);
00410 if( !m_is_rec )
00411 {
00412 m_rec_pos = 0;
00413 m_rec_size = 0;
00414 }
00415 }
00416 return m_rec_size;
00417 }
00418
00419 if( m_ere_expression->is_left_anchor() )
00420 {
00421 m_ere_branch->recognize( buf, buf_length, buf_offset, true );
00422 m_is_rec = m_ere_branch->is_recognized() &&
00423 (m_ere_branch->recognized_position() == buf_offset);
00424 if( m_is_rec )
00425 {
00426 m_rec_pos = buf_offset;
00427 m_rec_size = m_ere_branch->recognized_size();
00428 }
00429 return m_rec_size;
00430 }
00431
00432
00433 unsigned long buf_split = 0;
00434 unsigned long longest_recognized_size = 0;
00435 unsigned long buf_split_for_shortest_size = 0;
00436 unsigned long shortest_left_recognized_size = INT_MAX;
00437 unsigned long buf_split_for_longest_size = buf_split;
00438 bool left_side_recognized = true;
00439 m_is_rec = true;
00440
00441
00442
00443
00444 while( left_side_recognized && buf_split < buf_length - buf_offset )
00445 {
00446 m_ere_expression->recognize( buf, buf_length-buf_split, buf_offset );
00447 left_side_recognized = m_ere_expression->is_recognized();
00448 if( left_side_recognized )
00449 {
00450 m_rec_size = m_ere_expression->recognized_size();
00451 m_rec_pos = m_ere_expression->recognized_position();
00452
00453
00454 m_ere_branch->recognize(
00455 buf,
00456 buf_length,
00457 m_rec_pos + m_rec_size,
00458 false );
00459
00460 m_is_rec = m_ere_branch->is_recognized();
00461 if( m_is_rec )
00462 {
00463 buf_split = buf_length - m_rec_pos - m_rec_size;
00464 if( shortest_left_recognized_size > m_rec_size )
00465
00466 {
00467
00468 shortest_left_recognized_size = m_rec_size;
00469 buf_split_for_shortest_size = buf_split;
00470 }
00471
00472 m_rec_size += m_ere_branch->recognized_size();
00473 if( longest_recognized_size < m_rec_size )
00474 {
00475 buf_split_for_longest_size = buf_split;
00476 longest_recognized_size = m_rec_size;
00477 }
00478 }
00479
00480 buf_split++;
00481 }
00482 }
00483
00484
00485
00486 if( longest_recognized_size > 0 || shortest_left_recognized_size < INT_MAX )
00487 {
00488 if( m_ere_expression->is_lazy() )
00489 {
00490 m_ere_expression->recognize( buf,
00491 buf_length-buf_split_for_shortest_size,
00492 buf_offset );
00493 }
00494 else
00495 {
00496
00497 m_ere_expression->recognize( buf,
00498 buf_length-buf_split_for_longest_size,
00499 buf_offset );
00500 }
00501 m_rec_size = m_ere_expression->recognized_size();
00502 m_rec_pos = m_ere_expression->recognized_position();
00503
00504
00505 m_ere_branch->recognize(
00506 buf,
00507 buf_length,
00508 m_rec_pos + m_rec_size,
00509 false );
00510
00511 m_rec_size += m_ere_branch->recognized_size();
00512 m_is_rec = true;
00513
00514 }
00515 else
00516 {
00517 m_is_rec = false;
00518 }
00519 return m_rec_size;
00520 }
00521
00522 template<typename E>
00523 void ere_branch<E>::ere_expression_ere_branch::assign_matches(
00524 matches::match_key parent_address,
00525 unsigned long& branch_pos,
00526 matches& m )
00527 {
00528 if( m_ere_branch.is_parsed() && m_ere_expression.is_parsed() )
00529 {
00530 m_ere_expression->assign_matches( parent_address, branch_pos, m );
00531 m_ere_branch->assign_matches( parent_address, branch_pos, m );
00532 }
00533 }
00534
00535 template<typename E>
00536 void ere_branch<E>::ere_expression_ere_branch::push_parsers( parser_list& l )
00537 {
00538 l.push_back( &m_ere_expression );
00539 l.push_back( &m_ere_branch );
00540 }
00541
00542 template<typename E>
00543 unsigned long ere_branch<E>::recognize( const E* buf,
00544 const unsigned long buf_length,
00545 const unsigned long buf_offset,
00546 bool try_positions )
00547 {
00548 if( m_ere.is_parsed() )
00549 {
00550 m_ere->recognize( buf, buf_length, buf_offset, try_positions );
00551 m_is_rec = m_ere->is_recognized();
00552 m_rec_pos = m_ere->recognized_position();
00553 m_rec_size = m_ere->recognized_size();
00554 }
00555 else
00556 {
00557 m_ere_expression_ere_branch->recognize( buf, buf_length, buf_offset, try_positions );
00558 m_is_rec = m_ere_expression_ere_branch->is_recognized();
00559 m_rec_pos = m_ere_expression_ere_branch->recognized_position();
00560 m_rec_size = m_ere_expression_ere_branch->recognized_size();
00561 }
00562 return m_rec_size;
00563 }
00564
00565 template<typename E>
00566 void ere_branch<E>::assign_matches( matches::match_key parent_address,
00567 unsigned long& branch_pos,
00568 matches& m )
00569 {
00570 if( m_ere.is_parsed() )
00571 {
00572 m_ere->assign_matches( parent_address, branch_pos, m );
00573 }
00574 else
00575 {
00576 m_ere_expression_ere_branch->assign_matches( parent_address, branch_pos, m );
00577 }
00578 }
00579
00580 template<typename E>
00581 void ere_branch<E>::push_parsers( parser_list& l )
00582 {
00583 l.push_back( &m_ere_expression_ere_branch );
00584 l.push_back( &m_ere );
00585 }
00586
00587 template<typename E>
00588 unsigned long extended_reg_exp<E>::ere_branch_or_extended_reg_exp::recognize(
00589 const E* buf,
00590 const unsigned long buf_length,
00591 const unsigned long buf_offset,
00592 bool try_positions )
00593 {
00594 m_rec_size = 0;
00595 m_rec_pos = 0;
00596 m_ext_regexp->recognize( buf, buf_length, buf_offset );
00597 m_branch->recognize( buf, buf_length, buf_offset );
00598
00599 if( m_ext_regexp->is_recognized() && m_branch->is_recognized() )
00600 {
00601 if( m_branch->recognized_size() > m_ext_regexp->recognized_size() )
00602 {
00603 m_ext_regexp->unset();
00604 }
00605 else
00606 {
00607 m_branch->unset();
00608 }
00609 }
00610
00611 m_is_rec = true;
00612 if( m_ext_regexp->is_recognized() )
00613 {
00614 m_rec_pos = m_ext_regexp->recognized_position();
00615 m_rec_size = m_ext_regexp->recognized_size();
00616 }
00617 else if( m_branch->is_recognized() )
00618 {
00619 m_rec_pos = m_branch->recognized_position();
00620 m_rec_size = m_branch->recognized_size();
00621 }
00622 else
00623 {
00624 m_is_rec = false;
00625 }
00626 return m_rec_size;
00627 }
00628
00629 template<typename E>
00630 void extended_reg_exp<E>::ere_branch_or_extended_reg_exp::assign_matches(
00631 matches::match_key parent_address,
00632 unsigned long& branch_pos,
00633 matches& m )
00634 {
00635 if( m_branch->is_recognized() )
00636 {
00637 m_branch->assign_matches( parent_address, branch_pos, m );
00638 }
00639 else
00640 {
00641 m_ext_regexp->assign_matches( parent_address, branch_pos, m );
00642 }
00643 }
00644
00645 template<typename E>
00646 void extended_reg_exp<E>::ere_branch_or_extended_reg_exp::push_parsers(
00647 parser_list& l )
00648 {
00649 l.push_back( &m_branch );
00650 l.push_back( &m_or );
00651 l.push_back( &m_ext_regexp );
00652 }
00653
00654 template<typename E>
00655 unsigned long extended_reg_exp<E>::recognize( const E* buf,
00656 const unsigned long buf_length,
00657 const unsigned long buf_offset,
00658 bool try_positions )
00659 {
00660 m_rec_pos = 0;
00661 m_rec_size = 0;
00662 if( m_branch.is_parsed() )
00663 {
00664 m_branch->recognize( buf, buf_length, buf_offset );
00665 m_is_rec = m_branch->is_recognized();
00666 if( m_is_rec )
00667 {
00668 m_rec_pos = m_branch->recognized_position();
00669 m_rec_size = m_branch->recognized_size();
00670 }
00671 }
00672 else
00673 {
00674 m_branch_or_ext_regexp->recognize( buf, buf_length, buf_offset );
00675 m_is_rec = m_branch_or_ext_regexp->is_recognized();
00676 if( m_is_rec )
00677 {
00678 m_rec_pos = m_branch_or_ext_regexp->recognized_position();
00679 m_rec_size = m_branch_or_ext_regexp->recognized_size();
00680 }
00681 }
00682 return m_rec_size;
00683 }
00684
00685 template<typename E>
00686 void extended_reg_exp<E>::assign_matches( matches::match_key key,
00687 unsigned long& branch_pos,
00688 matches& m )
00689 {
00690 m.insert( key, match( m_rec_pos, m_rec_size ) );
00691 unsigned long new_branch_pos = 0;
00692 if( m_branch.is_parsed() )
00693 {
00694 m_branch->assign_matches( key, new_branch_pos, m );
00695 }
00696 else
00697 {
00698 m_branch_or_ext_regexp->assign_matches( key, new_branch_pos, m );
00699 }
00700 }
00701
00702 template<typename E>
00703 void extended_reg_exp<E>::push_parsers( parser_list& l )
00704 {
00705 l.push_back( &m_branch_or_ext_regexp );
00706 l.push_back( &m_branch );
00707 }
00708
00709 };
00710
00711 #endif