Refactor lexer to use regular expressions

2026-01-12 22:45:52 +08:00 · 2010-12-07 11:42:34 -08:00
parent e5e69d9b90
commit 23fc73081f
2 changed files with 56 additions and 103 deletions
--- a/src/parser.js
+++ b/src/parser.js
@@ -32,7 +32,7 @@ function lex(text, parseStringsForObjects){
      index = 0,
      json = [],
      ch,
-      lastCh = ':'; // can start regexp
+      lastCh = ':';

  while (index < text.length) {
    ch = text.charAt(index);
@@ -71,6 +71,9 @@ function lex(text, parseStringsForObjects){
    lastCh = ch;
  }
  return tokens;
+  
+  
+  //////////////////////////////////////////////

  function is(chars) {
    return chars.indexOf(ch) != -1;
@@ -95,10 +98,6 @@ function lex(text, parseStringsForObjects){
           'A' <= ch && ch <= 'Z' ||
           '_' == ch || ch == '$';
  }
-  function isExpOperator(ch) {
-    return ch == '-' || ch == '+' || isNumber(ch);
-  }
-
  function throwError(error, start, end) {
    end = end || index;
    throw Error("Lexer Error: " + error + " at column" +
@@ -107,103 +106,61 @@ function lex(text, parseStringsForObjects){
            " " + end) + 
        " in expression [" + text + "].");
  }
+  
+  function consume(regexp, processToken, errorMsg) { // lex one token: match regexp at index, let processToken fill the token in, then push it
+    var match = text.substr(index).match(regexp); // the callers visible in this patch all pass ^-anchored regexps, so the match starts at index
+    var token = {index: index};
+    var start = index; // remember where the token began; index is advanced below
+    if (!match) throwError(errorMsg);
+    index += match[0].length; // advance past the matched text before processToken runs
+    processToken(token, token.text = match[0], start); // called as (token, matchedText, startIndex); it may overwrite token.text
+    tokens.push(token);
+  }

  function readNumber() {
-    var number = "";
-    var start = index;
-    while (index < text.length) {
-      var ch = lowercase(text.charAt(index));
-      if (ch == '.' || isNumber(ch)) {
-        number += ch;
-      } else {
-        var peekCh = peek();
-        if (ch == 'e' && isExpOperator(peekCh)) {
-          number += ch;
-        } else if (isExpOperator(ch) &&
-            peekCh && isNumber(peekCh) &&
-            number.charAt(number.length - 1) == 'e') {
-          number += ch;
-        } else if (isExpOperator(ch) &&
-            (!peekCh || !isNumber(peekCh)) &&
-            number.charAt(number.length - 1) == 'e') {
-          throwError('Invalid exponent');
-        } else {
-          break;
-        }
-      }
-      index++;
-    }
-    number = 1 * number;
-    tokens.push({index:start, text:number, json:true,
-      fn:function(){return number;}});
+    consume(/^(\d+)?(\.\d+)?([eE][+-]?\d+)?/, function(token, number){ // optional integer part, optional fraction, optional exponent
+      token.text = number = 1 * number; // coerce the matched text to a number; token.text becomes that number
+      token.json = true;
+      token.fn = valueFn(number); // valueFn is defined outside this patch; presumably returns a function yielding number (the removed code used function(){return number;})
+    }, "Not a valid number"); // NOTE(review): every group in the regexp is optional, so it also matches the empty string and match is never null; this message looks unreachable
  }
+  
  function readIdent() {
-    var ident = "";
-    var start = index;
-    var fn;
-    while (index < text.length) {
-      var ch = text.charAt(index);
-      if (ch == '.' || isIdent(ch) || isNumber(ch)) {
-        ident += ch;
-      } else {
-        break;
+    consume(/^[\w_\$][\w_\$\d]*(\.[\w_\$][\w_\$\d]*)*/, function(token, ident){ // one or more segments of word characters or $, separated by single dots
+      var fn = OPERATORS[ident]; // declared locally: the old `var fn;` is removed by this patch, so a bare assignment would create an implicit global
+      if (!fn) {
+        fn = getterFn(ident);
+        fn.isAssignable = ident; // NOTE(review): fn is not read again below; token.fn is built from a separate lookup
      }
-      index++;
-    }
-    fn = OPERATORS[ident];
-    tokens.push({
-      index:start, 
-      text:ident, 
-      json: fn,
-      fn:fn||extend(getterFn(ident), {
+      token.fn = OPERATORS[ident]||extend(getterFn(ident), { // an OPERATORS entry is used as-is; otherwise a getter extended with assign()
        assign:function(self, value){
          return setter(self, ident, value);
        }
-      })
+      });
+      token.json = OPERATORS[ident]; // stores the OPERATORS lookup result itself, not a boolean
    });
  }
  
  function readString(quote) {
-    var start = index;
-    index++;
-    var string = "";
-    var rawString = quote;
-    var escape = false;
-    while (index < text.length) {
-      var ch = text.charAt(index);
-      rawString += ch;
-      if (escape) {
-        if (ch == 'u') {
-          var hex = text.substring(index + 1, index + 5);
-          if (!hex.match(/[\da-f]{4}/i))
-            throwError( "Invalid unicode escape [\\u" + hex + "]");
-          index += 4;
-          string += String.fromCharCode(parseInt(hex, 16));
-        } else {
-          var rep = ESCAPE[ch];
-          if (rep) {
-            string += rep;
-          } else {
-            string += ch;
-          }
-        }
-        escape = false;
-      } else if (ch == '\\') {
-        escape = true;
-      } else if (ch == quote) {
-        index++;
-        tokens.push({index:start, text:rawString, string:string, json:true,
-          fn:function(){
-            return (string.length == dateParseLength) ?
-              angular['String']['toDate'](string) : string;
-          }});
-        return;
-      } else {
-        string += ch;
-      }
-      index++;
-    }
-    throwError("Unterminated quote", start);
+    consume(/^(('(\\[\s\S]|[^'\\])*')|("(\\[\s\S]|[^"\\])*"))/, function(token, rawString, start){ // a backslash always consumes the next character, so an escaped backslash or quote is never taken for the closing quote
+      var hasError; // deferred thrower: set inside the replace callback, invoked after replace() has finished
+      var string = token.string = rawString.substr(1, rawString.length - 2). // strip the surrounding quotes
+        replace(/(\\u(.?.?.?.?))|(\\(.))/g,
+          function(match, wholeUnicode, unicode, wholeEscape, escape){
+            if (wholeUnicode && !unicode.match(/[\da-fA-F]{4}/)) // test wholeUnicode, not unicode: a bare \u captures '' and must still be rejected
+              hasError = hasError || bind(null, throwError, "Invalid unicode escape [\\u" + unicode + "]", start);
+            return wholeUnicode ?
+                String.fromCharCode(parseInt(unicode, 16)) :
+                ESCAPE[escape] || escape; // escapes without an ESCAPE entry fall back to the escaped character itself
+          });
+      (hasError||noop)(); // throws the first recorded unicode error, if any
+      token.json = true;
+      token.fn = function(){
+        return (string.length == dateParseLength) ?
+            angular['String']['toDate'](string) :
+            string;
+      };
+    }, "Unterminated string");
  }
 }

--- a/test/ParserSpec.js
+++ b/test/ParserSpec.js
@@ -82,9 +82,15 @@ describe('parser', function() {
      expect(tokens.length).toEqual(1);
      expect(tokens[0].string).toEqual('\u00a0');
    });
+    
+    it('should error when non terminated string', function(){
+      expect(function(){
+        lex('ignore "text'); // the opening " sits at index 7 and has no closing quote
+      }).toThrow(new Error('Lexer Error: Unterminated string at column 7 in expression [ignore "text].'));
+    });

    it('should ignore whitespace', function() {
-      var tokens = lex("a \t \n \r b");
+      var tokens = lex("a \t \n \r \u00A0 b"); // \u00A0 (no-break space) is expected to be skipped like the other whitespace
      expect(tokens[0].text).toEqual('a');
      expect(tokens[1].text).toEqual('b');
    });
@@ -130,16 +136,6 @@ describe('parser', function() {
      expect(tokens[0].text).toEqual(0.5E+10);
    });

-    it('should throws exception for invalid exponent', function() {
-      expect(function() {
-        lex("0.5E-");
-      }).toThrow(new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-].'));
-      
-      expect(function() {
-        lex("0.5E-A");
-      }).toThrow(new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-A].'));
-    });
-
    it('should tokenize number starting with a dot', function() {
      var tokens = lex(".5");
      expect(tokens[0].text).toEqual(0.5);
@@ -147,8 +143,8 @@ describe('parser', function() {

    it('should throw error on invalid unicode', function() {
      expect(function() {
-        lex("'\\u1''bla'");
-      }).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1''b] at column 2 in expression ['\\u1''bla']."));
+        lex("'\\u1xbla'"); // the four characters after \u are "1xbl", which are not four hex digits
+      }).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1xbl] at columns 0-9 ['\\u1xbla'] in expression ['\\u1xbla']."));
    });
  });