The Labs \ Source Viewer \ SSCLI \ System.Text.RegularExpressions \ RegexRunner

  1. //------------------------------------------------------------------------------
  2. // <copyright file="RegexRunner.cs" company="Microsoft">
  3. //
  4. // Copyright (c) 2006 Microsoft Corporation. All rights reserved.
  5. //
  6. // The use and distribution terms for this software are contained in the file
  7. // named license.txt, which can be found in the root of this distribution.
  8. // By using this software in any fashion, you are agreeing to be bound by the
  9. // terms of this license.
  10. //
  11. // You must not remove this notice, or any other, from this software.
  12. //
  13. // </copyright>
  14. //------------------------------------------------------------------------------
  15. // This RegexRunner class is a base class for compiled regex code.
  16. // Implementation notes:
  17. //
  18. // RegexRunner provides a common calling convention and a common
  19. // runtime environment for the interpreter and the compiled code.
  20. //
  21. // It provides the driver code that call's the subclass's Go()
  22. // method for either scanning or direct execution.
  23. //
  24. // It also maintains memory allocation for the backtracking stack,
  25. // the grouping stack and the longjump crawlstack, and provides
  26. // methods to push new subpattern match results into (or remove
  27. // backtracked results from) the Match instance.
  28. namespace System.Text.RegularExpressions
  29. {
  30.    
  31.     using System.Collections;
  32.     using System.Diagnostics;
  33.     using System.ComponentModel;
  34.     using System.Globalization;
  35.    
  36.     /// <internalonly/>
  37.     [EditorBrowsable(EditorBrowsableState.Never)]
  38.     public abstract class RegexRunner
  39.     {
  40.         protected internal int runtextbeg;
  41.         // beginning of text to search
  42.         protected internal int runtextend;
  43.         // end of text to search
  44.         protected internal int runtextstart;
  45.         // starting point for search
  46.         protected internal string runtext;
  47.         // text to search
  48.         protected internal int runtextpos;
  49.         // current position in text
  50.         protected internal int[] runtrack;
  51.         // The backtracking stack. Opcodes use this to store data regarding
  52.         protected internal int runtrackpos;
  53.         // what they have matched and where to backtrack to. Each "frame" on
  54.         // the stack takes the form of [CodePosition Data1 Data2...], where
  55.         // CodePosition is the position of the current opcode and
  56.         // the data values are all optional. The CodePosition can be negative, and
  57.         // these values (also called "back2") are used by the BranchMark family of opcodes
  58.         // to indicate whether they are backtracking after a successful or failed
  59.         // match.
  60.         // When we backtrack, we pop the CodePosition off the stack, set the current
  61.         // instruction pointer to that code position, and mark the opcode
  62.         // with a backtracking flag ("Back"). Each opcode then knows how to
  63.         // handle its own data.
  64.         protected internal int[] runstack;
  65.         // This stack is used to track text positions across different opcodes.
  66.         protected internal int runstackpos;
  67.         // For example, in /(a*b)+/, the parentheses result in a SetMark/CaptureMark
  68.         // pair. SetMark records the text position before we match a*b. Then
  69.         // CaptureMark uses that position to figure out where the capture starts.
  70.         // Opcodes which push onto this stack are always paired with other opcodes
  71.         // which will pop the value from it later. A successful match should mean
  72.         // that this stack is empty.
  73.         protected internal int[] runcrawl;
  74.         // The crawl stack is used to keep track of captures. Every time a group
  75.         protected internal int runcrawlpos;
  76.         // has a capture, we push its group number onto the runcrawl stack. In
  77.         // the case of a balanced match, we push BOTH groups onto the stack.
  78.         protected internal int runtrackcount;
  79.         // count of states that may do backtracking
  80.         protected internal Match runmatch;
  81.         // result object
  82.         protected internal Regex runregex;
  83.         // regex object
  84.         protected internal RegexRunner()
  85.         {
  86.         }
  87.        
  88. /*
  89.         * Scans the string to find the first match. Uses the Match object
  90.         * both to feed text in and as a place to store matches that come out.
  91.         *
  92.         * All the action is in the abstract Go() method defined by subclasses. Our
  93.         * responsibility is to load up the class members (as done here) before
  94.         * calling Go.
  95.         *
  96.         * <CONSIDER>the optimizer can compute a set of candidate starting characters,
  97.         * and we should have a separate method Skip() that will quickly scan past
  98.         * any characters that we know can't match.</CONSIDER>
  99.         *
  100.         * <CONSIDER>we should be aware of ^ or .* anchored searches and not iterate.</CONSIDER>
  101.         */       
  102.         protected internal Match Scan(Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick)
  103.         {
  104.             int bump;
  105.             int stoppos;
  106.             bool initted = false;
  107.            
  108.             runregex = regex;
  109.             runtext = text;
  110.             runtextbeg = textbeg;
  111.             runtextend = textend;
  112.             runtextstart = textstart;
  113.            
  114.             bump = runregex.RightToLeft ? -1 : 1;
  115.             stoppos = runregex.RightToLeft ? runtextbeg : runtextend;
  116.            
  117.             runtextpos = textstart;
  118.            
  119.             // If previous match was empty or failed, advance by one before matching
  120.            
  121.             if (prevlen == 0) {
  122.                 if (runtextpos == stoppos)
  123.                     return Match.Empty;
  124.                
  125.                 runtextpos += bump;
  126.             }
  127.            
  128.             for (;;) {
  129.                 #if DBG
  130.                 if (runregex.Debug) {
  131.                     Debug.WriteLine("");
  132.                     Debug.WriteLine("Search range: from " + runtextbeg.ToString(CultureInfo.InvariantCulture) + " to " + runtextend.ToString(CultureInfo.InvariantCulture));
  133.                     Debug.WriteLine("Firstchar search starting at " + runtextpos.ToString(CultureInfo.InvariantCulture) + " stopping at " + stoppos.ToString(CultureInfo.InvariantCulture));
  134.                 }
  135.                 #endif
  136.                 if (FindFirstChar()) {
  137.                     if (!initted) {
  138.                         InitMatch();
  139.                         initted = true;
  140.                     }
  141.                     #if DBG
  142.                     if (runregex.Debug) {
  143.                         Debug.WriteLine("Executing engine starting at " + runtextpos.ToString(CultureInfo.InvariantCulture));
  144.                         Debug.WriteLine("");
  145.                     }
  146.                     #endif
  147.                     Go();
  148.                    
  149.                     if (runmatch._matchcount[0] > 0) {
  150.                         return TidyMatch(quick);
  151.                     }
  152.                    
  153.                     // reset state for another go
  154.                     runtrackpos = runtrack.Length;
  155.                     runstackpos = runstack.Length;
  156.                     runcrawlpos = runcrawl.Length;
  157.                 }
  158.                
  159.                 // failure!
  160.                
  161.                 if (runtextpos == stoppos) {
  162.                     TidyMatch(true);
  163.                     return Match.Empty;
  164.                 }
  165.                
  166.                
  167.                
  168.                 runtextpos += bump;
  169.             }
  170.            
  171.         }
  172.        
  173. /*
  174.         * The responsibility of Go() is to run the regular expression at
  175.         * runtextpos and call Capture() on all the captured subexpressions,
  176.         * then to leave runtextpos at the ending position. It should leave
  177.         * runtextpos where it started if there was no match.
  178.         */       
  179.         protected abstract void Go();
  180.        
  181. /*
  182.         * The responsibility of FindFirstChar() is to advance runtextpos
  183.         * until it is at the next position which is a candidate for the
  184.         * beginning of a successful match.
  185.         */       
  186.         protected abstract bool FindFirstChar();
  187.        
  188. /*
  189.         * InitTrackCount must initialize the runtrackcount field; this is
  190.         * used to know how large the initial runtrack and runstack arrays
  191.         * must be.
  192.         */       
  193.         protected abstract void InitTrackCount();
  194.        
  195. /*
  196.         * Initializes all the data members that are used by Go()
  197.         */       
  198.         private void InitMatch()
  199.         {
  200.             // Use a hashtable'ed Match object if the capture numbers are sparse
  201.            
  202.             if (runmatch == null) {
  203.                 if (runregex.caps != null)
  204.                     runmatch = new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart);
  205.                 else
  206.                     runmatch = new Match(runregex, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart);
  207.             }
  208.             else {
  209.                 runmatch.Reset(runregex, runtext, runtextbeg, runtextend, runtextstart);
  210.             }
  211.            
  212.             // note we test runcrawl, because it is the last one to be allocated
  213.             // If there is an alloc failure in the middle of the three allocations,
  214.             // we may still return to reuse this instance, and we want to behave
  215.             // as if the allocations didn't occur. (we used to test _trackcount != 0)
  216.            
  217.             if (runcrawl != null) {
  218.                 runtrackpos = runtrack.Length;
  219.                 runstackpos = runstack.Length;
  220.                 runcrawlpos = runcrawl.Length;
  221.                 return;
  222.             }
  223.            
  224.             InitTrackCount();
  225.            
  226.             int tracksize = runtrackcount * 8;
  227.             int stacksize = runtrackcount * 8;
  228.            
  229.             if (tracksize < 32)
  230.                 tracksize = 32;
  231.             if (stacksize < 16)
  232.                 stacksize = 16;
  233.            
  234.             runtrack = new int[tracksize];
  235.             runtrackpos = tracksize;
  236.            
  237.             runstack = new int[stacksize];
  238.             runstackpos = stacksize;
  239.            
  240.             runcrawl = new int[32];
  241.             runcrawlpos = 32;
  242.         }
  243.        
  244. /*
  245.         * Put match in its canonical form before returning it.
  246.         */       
  247.         private Match TidyMatch(bool quick)
  248.         {
  249.             if (!quick) {
  250.                 Match match = runmatch;
  251.                
  252.                 runmatch = null;
  253.                
  254.                 match.Tidy(runtextpos);
  255.                 return match;
  256.             }
  257.             else {
  258.                 // in quick mode, a successful match returns null, and
  259.                 // the allocated match object is left in the cache
  260.                
  261.                 return null;
  262.             }
  263.         }
  264.        
  265. /*
  266.         * Called by the implemenation of Go() to increase the size of storage
  267.         */       
  268.         protected void EnsureStorage()
  269.         {
  270.             if (runstackpos < runtrackcount * 4)
  271.                 DoubleStack();
  272.             if (runtrackpos < runtrackcount * 4)
  273.                 DoubleTrack();
  274.         }
  275.        
  276. /*
  277.         * Called by the implemenation of Go() to decide whether the pos
  278.         * at the specified index is a boundary or not. It's just not worth
  279.         * emitting inline code for this logic.
  280.         */       
  281.         protected bool IsBoundary(int index, int startpos, int endpos)
  282.         {
  283.             return (index > startpos && RegexCharClass.IsWordChar(runtext[index - 1])) != (index < endpos && RegexCharClass.IsWordChar(runtext[index]));
  284.         }
  285.        
  286.         protected bool IsECMABoundary(int index, int startpos, int endpos)
  287.         {
  288.             return (index > startpos && RegexCharClass.IsECMAWordChar(runtext[index - 1])) != (index < endpos && RegexCharClass.IsECMAWordChar(runtext[index]));
  289.         }
  290.        
  291.         protected static bool CharInSet(char ch, string set, string category)
  292.         {
  293.             string charClass = RegexCharClass.ConvertOldStringsToClass(set, category);
  294.             return RegexCharClass.CharInClass(ch, charClass);
  295.         }
  296.        
  297.         protected static bool CharInClass(char ch, string charClass)
  298.         {
  299.             return RegexCharClass.CharInClass(ch, charClass);
  300.         }
  301.        
  302. /*
  303.         * Called by the implemenation of Go() to increase the size of the
  304.         * backtracking stack.
  305.         */       
  306.         protected void DoubleTrack()
  307.         {
  308.             int[] newtrack;
  309.            
  310.             newtrack = new int[runtrack.Length * 2];
  311.            
  312.             System.Array.Copy(runtrack, 0, newtrack, runtrack.Length, runtrack.Length);
  313.             runtrackpos += runtrack.Length;
  314.             runtrack = newtrack;
  315.         }
  316.        
  317. /*
  318.         * Called by the implemenation of Go() to increase the size of the
  319.         * grouping stack.
  320.         */       
  321.         protected void DoubleStack()
  322.         {
  323.             int[] newstack;
  324.            
  325.             newstack = new int[runstack.Length * 2];
  326.            
  327.             System.Array.Copy(runstack, 0, newstack, runstack.Length, runstack.Length);
  328.             runstackpos += runstack.Length;
  329.             runstack = newstack;
  330.         }
  331.        
  332. /*
  333.         * Increases the size of the longjump unrolling stack.
  334.         */       
  335.         protected void DoubleCrawl()
  336.         {
  337.             int[] newcrawl;
  338.            
  339.             newcrawl = new int[runcrawl.Length * 2];
  340.            
  341.             System.Array.Copy(runcrawl, 0, newcrawl, runcrawl.Length, runcrawl.Length);
  342.             runcrawlpos += runcrawl.Length;
  343.             runcrawl = newcrawl;
  344.         }
  345.        
  346. /*
  347.         * Save a number on the longjump unrolling stack
  348.         */       
  349.         protected void Crawl(int i)
  350.         {
  351.             if (runcrawlpos == 0)
  352.                 DoubleCrawl();
  353.            
  354.             runcrawl[--runcrawlpos] = i;
  355.         }
  356.        
  357. /*
  358.         * Remove a number from the longjump unrolling stack
  359.         */       
  360.         protected int Popcrawl()
  361.         {
  362.             return runcrawl[runcrawlpos++];
  363.         }
  364.        
  365. /*
  366.         * Get the height of the stack
  367.         */       
  368.         protected int Crawlpos()
  369.         {
  370.             return runcrawl.Length - runcrawlpos;
  371.         }
  372.        
  373. /*
  374.         * Called by Go() to capture a subexpression. Note that the
  375.         * capnum used here has already been mapped to a non-sparse
  376.         * index (by the code generator RegexWriter).
  377.         */       
  378.         protected void Capture(int capnum, int start, int end)
  379.         {
  380.             if (end < start) {
  381.                 int T;
  382.                
  383.                 T = end;
  384.                 end = start;
  385.                 start = T;
  386.             }
  387.            
  388.             Crawl(capnum);
  389.             runmatch.AddMatch(capnum, start, end - start);
  390.         }
  391.        
  392. /*
  393.         * Called by Go() to capture a subexpression. Note that the
  394.         * capnum used here has already been mapped to a non-sparse
  395.         * index (by the code generator RegexWriter).
  396.         */       
  397.         protected void TransferCapture(int capnum, int uncapnum, int start, int end)
  398.         {
  399.             int start2;
  400.             int end2;
  401.            
  402.             // these are the two intervals that are cancelling each other
  403.            
  404.             if (end < start) {
  405.                 int T;
  406.                
  407.                 T = end;
  408.                 end = start;
  409.                 start = T;
  410.             }
  411.            
  412.             start2 = MatchIndex(uncapnum);
  413.             end2 = start2 + MatchLength(uncapnum);
  414.            
  415.             // The new capture gets the innermost defined interval
  416.            
  417.             if (start >= end2) {
  418.                 end = start;
  419.                 start = end2;
  420.             }
  421.             else if (end <= start2) {
  422.                 start = start2;
  423.             }
  424.             else {
  425.                 if (end > end2)
  426.                     end = end2;
  427.                 if (start2 > start)
  428.                     start = start2;
  429.             }
  430.            
  431.             Crawl(uncapnum);
  432.             runmatch.BalanceMatch(uncapnum);
  433.            
  434.             if (capnum != -1) {
  435.                 Crawl(capnum);
  436.                 runmatch.AddMatch(capnum, start, end - start);
  437.             }
  438.         }
  439.        
  440. /*
  441.         * Called by Go() to revert the last capture
  442.         */       
  443.         protected void Uncapture()
  444.         {
  445.             int capnum = Popcrawl();
  446.             runmatch.RemoveMatch(capnum);
  447.         }
  448.        
  449. /*
  450.         * Call out to runmatch to get around visibility issues
  451.         */       
  452.         protected bool IsMatched(int cap)
  453.         {
  454.             return runmatch.IsMatched(cap);
  455.         }
  456.        
  457. /*
  458.         * Call out to runmatch to get around visibility issues
  459.         */       
  460.         protected int MatchIndex(int cap)
  461.         {
  462.             return runmatch.MatchIndex(cap);
  463.         }
  464.        
  465. /*
  466.         * Call out to runmatch to get around visibility issues
  467.         */       
  468.         protected int MatchLength(int cap)
  469.         {
  470.             return runmatch.MatchLength(cap);
  471.         }
  472.        
  473.         #if DBG
  474. /*
  475.         * Dump the current state
  476.         */       
  477.         internal virtual void DumpState()
  478.         {
  479.             Debug.WriteLine("Text: " + TextposDescription());
  480.             Debug.WriteLine("Track: " + StackDescription(runtrack, runtrackpos));
  481.             Debug.WriteLine("Stack: " + StackDescription(runstack, runstackpos));
  482.         }
  483.        
  484.         static internal string StackDescription(int[] A, int Index)
  485.         {
  486.             StringBuilder Sb = new StringBuilder();
  487.            
  488.             Sb.Append(A.Length - Index);
  489.             Sb.Append('/');
  490.             Sb.Append(A.Length);
  491.            
  492.             if (Sb.Length < 8)
  493.                 Sb.Append(' ', 8 - Sb.Length);
  494.            
  495.             Sb.Append("(");
  496.            
  497.             for (int i = Index; i < A.Length; i++) {
  498.                 if (i > Index)
  499.                     Sb.Append(' ');
  500.                 Sb.Append(A[i]);
  501.             }
  502.            
  503.             Sb.Append(')');
  504.            
  505.             return Sb.ToString();
  506.         }
  507.        
  508.         internal virtual string TextposDescription()
  509.         {
  510.             StringBuilder Sb = new StringBuilder();
  511.             int remaining;
  512.            
  513.             Sb.Append(runtextpos);
  514.            
  515.             if (Sb.Length < 8)
  516.                 Sb.Append(' ', 8 - Sb.Length);
  517.            
  518.             if (runtextpos > runtextbeg)
  519.                 Sb.Append(RegexCharClass.CharDescription(runtext[runtextpos - 1]));
  520.             else
  521.                 Sb.Append('^');
  522.            
  523.             Sb.Append('>');
  524.            
  525.             remaining = runtextend - runtextpos;
  526.            
  527.             for (int i = runtextpos; i < runtextend; i++) {
  528.                 Sb.Append(RegexCharClass.CharDescription(runtext[i]));
  529.             }
  530.             if (Sb.Length >= 64) {
  531.                 Sb.Length = 61;
  532.                 Sb.Append("...");
  533.             }
  534.             else {
  535.                 Sb.Append('$');
  536.             }
  537.            
  538.             return Sb.ToString();
  539.         }
  540.         #endif
  541.     }
  542.    
  543.    
  544.    
  545. }

Developer Fusion