Refactor lexer to use regular expressions

This commit is contained in:
Misko Hevery
2010-12-07 11:42:34 -08:00
parent e5e69d9b90
commit 23fc73081f
2 changed files with 56 additions and 103 deletions

View File

@@ -32,7 +32,7 @@ function lex(text, parseStringsForObjects){
index = 0,
json = [],
ch,
lastCh = ':'; // can start regexp
lastCh = ':';
while (index < text.length) {
ch = text.charAt(index);
@@ -71,6 +71,9 @@ function lex(text, parseStringsForObjects){
lastCh = ch;
}
return tokens;
//////////////////////////////////////////////
function is(chars) {
return chars.indexOf(ch) != -1;
@@ -95,10 +98,6 @@ function lex(text, parseStringsForObjects){
'A' <= ch && ch <= 'Z' ||
'_' == ch || ch == '$';
}
// Tells whether `ch` may appear in the exponent part of a number:
// either an explicit sign or anything isNumber() accepts.
function isExpOperator(ch) {
  if (ch == '-' || ch == '+') {
    return true;
  }
  return isNumber(ch);
}
function throwError(error, start, end) {
end = end || index;
throw Error("Lexer Error: " + error + " at column" +
@@ -107,103 +106,61 @@ function lex(text, parseStringsForObjects){
" " + end) +
" in expression [" + text + "].");
}
/**
 * Matches `regexp` against the unread remainder of `text`, moves `index`
 * past the matched text and appends the resulting token to `tokens`.
 *
 * @param {RegExp} regexp pattern applied to the input from `index` onwards
 * @param {function(Object, string, number)} processToken invoked with the new
 *   token, the matched text and the offset at which the match began; runs
 *   after `index` has been advanced and before the token is pushed
 * @param {string} errorMsg handed to throwError() when nothing matches
 */
function consume(regexp, processToken, errorMsg) {
  var begin = index;
  var found = text.substr(begin).match(regexp);
  if (!found) {
    throwError(errorMsg);
  }
  var lexeme = found[0];
  var token = {index: begin};
  index = begin + lexeme.length;
  token.text = lexeme;
  processToken(token, lexeme, begin);
  tokens.push(token);
}
// Reads a numeric literal that begins at `index` and appends one token for it.
// NOTE(review): this span appears to hold two versions of the body back to back
// with the diff markers stripped: the character-by-character scanner that ends
// at tokens.push(...), followed by the regex-based consume(...) call. Confirm
// which lines are live before editing.
function readNumber() {
var number = "";
var start = index;
while (index < text.length) {
// lowercase() presumably folds case so an 'E' exponent marker is treated like 'e' (helper not shown here)
var ch = lowercase(text.charAt(index));
if (ch == '.' || isNumber(ch)) {
number += ch;
} else {
var peekCh = peek();
// an 'e' is kept only when the following character is a sign or a digit
if (ch == 'e' && isExpOperator(peekCh)) {
number += ch;
// a sign/digit directly after the 'e' is kept only when the next character passes isNumber()
} else if (isExpOperator(ch) &&
peekCh && isNumber(peekCh) &&
number.charAt(number.length - 1) == 'e') {
number += ch;
// same position, but nothing numeric follows: the exponent is incomplete
} else if (isExpOperator(ch) &&
(!peekCh || !isNumber(peekCh)) &&
number.charAt(number.length - 1) == 'e') {
throwError('Invalid exponent');
} else {
break;
}
}
index++;
}
// convert the collected text to a number before storing it on the token
number = 1 * number;
tokens.push({index:start, text:number, json:true,
fn:function(){return number;}});
// pattern: optional integer digits, optional ".digits" fraction, optional exponent with optional sign;
// every group is optional, so the pattern can also match the empty string
consume(/^(\d+)?(\.\d+)?([eE][+-]?\d+)?/, function(token, number){
// token.text becomes the numeric value rather than the matched string
token.text = number = 1 * number;
token.json = true;
token.fn = valueFn(number);
}, "Not a valid number");
}
// Reads an identifier (optionally a dotted path) that begins at `index` and
// produces a token whose fn is either the matching OPERATORS entry or a getter.
// NOTE(review): the lines below appear to interleave the removed loop-based
// implementation with the added consume(...)-based one (diff markers stripped),
// so the span does not parse as shown. Confirm which lines are live before editing.
function readIdent() {
var ident = "";
var start = index;
var fn;
while (index < text.length) {
var ch = text.charAt(index);
// '.' is accepted here, so a dotted path is collected as a single identifier
if (ch == '.' || isIdent(ch) || isNumber(ch)) {
ident += ch;
} else {
break;
// pattern: one or more '.'-separated segments made of word characters, '_' or '$'
consume(/^[\w_\$][\w_\$\d]*(\.[\w_\$][\w_\$\d]*)*/, function(token, ident){
fn = OPERATORS[ident];
if (!fn) {
fn = getterFn(ident);
fn.isAssignable = ident;
}
index++;
}
// a name found in OPERATORS reuses that entry instead of building a getter
fn = OPERATORS[ident];
tokens.push({
index:start,
text:ident,
json: fn,
fn:fn||extend(getterFn(ident), {
token.fn = OPERATORS[ident]||extend(getterFn(ident), {
// assign() delegates to setter(self, ident, value)
assign:function(self, value){
return setter(self, ident, value);
}
})
});
// token.json mirrors the OPERATORS lookup, so it is only set for names found there
token.json = OPERATORS[ident];
});
}
// Reads a quoted string literal that begins at `index`, decodes its escape
// sequences and appends a token carrying both the raw text and the decoded string.
// NOTE(review): this span appears to hold two versions of the body back to back
// with the diff markers stripped: the character-by-character scanner that ends
// at throwError("Unterminated quote", start), followed by the regex-based
// consume(...) call. Confirm which lines are live before editing.
function readString(quote) {
var start = index;
index++;
var string = "";
var rawString = quote;
var escape = false;
while (index < text.length) {
var ch = text.charAt(index);
rawString += ch;
if (escape) {
if (ch == 'u') {
// \u must be followed by exactly four hex digits
var hex = text.substring(index + 1, index + 5);
if (!hex.match(/[\da-f]{4}/i))
throwError( "Invalid unicode escape [\\u" + hex + "]");
// skip the four hex digits that were just consumed
index += 4;
string += String.fromCharCode(parseInt(hex, 16));
} else {
// escapes listed in ESCAPE map to their replacement; any other escaped character stands for itself
var rep = ESCAPE[ch];
if (rep) {
string += rep;
} else {
string += ch;
}
}
escape = false;
} else if (ch == '\\') {
escape = true;
} else if (ch == quote) {
index++;
tokens.push({index:start, text:rawString, string:string, json:true,
fn:function(){
return (string.length == dateParseLength) ?
angular['String']['toDate'](string) : string;
}});
return;
} else {
string += ch;
}
index++;
}
// reached the end of the input without seeing the closing quote
throwError("Unterminated quote", start);
// pattern: a complete single- or double-quoted literal in which a backslash-quote pair may appear in the body.
// NOTE(review): [^'] / [^"] also accept a lone backslash, so through backtracking an input such as 'a\'
// (escaped quote, no real terminator) still matches as a complete literal — confirm whether that is intended.
consume(/^(('(\\'|[^'])*')|("(\\"|[^"])*"))/, function(token, rawString, start){
var hasError;
// drop the surrounding quotes, then decode \uXXXX and single-character escapes
var string = token.string = rawString.substr(1, rawString.length - 2).
replace(/(\\u(.?.?.?.?))|(\\(.))/g,
function(match, wholeUnicode, unicode, wholeEscape, escape){
// the error is only recorded here (first one wins) and raised after replace() has finished
if (unicode && !unicode.match(/[\da-fA-F]{4}/))
hasError = hasError || bind(null, throwError, "Invalid unicode escape [\\u" + unicode + "]", start);
return unicode ?
String.fromCharCode(parseInt(unicode, 16)) :
ESCAPE[escape] || escape;
});
// raise the recorded error, if any
(hasError||noop)();
token.json = true;
// when evaluated, strings whose length equals dateParseLength go through angular.String.toDate
token.fn = function(){
return (string.length == dateParseLength) ?
angular['String']['toDate'](string) :
string;
};
}, "Unterminated string");
}
}

View File

@@ -82,9 +82,15 @@ describe('parser', function() {
expect(tokens.length).toEqual(1);
expect(tokens[0].string).toEqual('\u00a0');
});
// An opening quote with no matching close must surface as a lexer error.
it('should error when non terminated string', function(){
  var expectedError = new Error('Lexer Error: Unterminated string at column 7 in expression [ignore "text].');
  var lexUnterminated = function(){
    lex('ignore "text');
  };
  expect(lexUnterminated).toThrow(expectedError);
});
// Whitespace between two identifiers must not produce tokens of its own.
// NOTE(review): the two `var tokens` lines look like the before/after of a
// single changed line (diff markers stripped); the second one adds a
// non-breaking space (\u00A0) to the input.
it('should ignore whitespace', function() {
var tokens = lex("a \t \n \r b");
var tokens = lex("a \t \n \r \u00A0 b");
expect(tokens[0].text).toEqual('a');
expect(tokens[1].text).toEqual('b');
});
@@ -130,16 +136,6 @@ describe('parser', function() {
expect(tokens[0].text).toEqual(0.5E+10);
});
// An exponent marker followed by a sign but no digit must be rejected,
// whether the input ends there or continues with a non-digit.
it('should throws exception for invalid exponent', function() {
  function lexDanglingSign() {
    lex("0.5E-");
  }
  function lexSignThenLetter() {
    lex("0.5E-A");
  }
  var danglingSignError = new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-].');
  var signThenLetterError = new Error('Lexer Error: Invalid exponent at column 4 in expression [0.5E-A].');

  expect(lexDanglingSign).toThrow(danglingSignError);
  expect(lexSignThenLetter).toThrow(signThenLetterError);
});
it('should tokenize number starting with a dot', function() {
var tokens = lex(".5");
expect(tokens[0].text).toEqual(0.5);
@@ -147,8 +143,8 @@ describe('parser', function() {
// A \u escape that is not followed by four hex digits must be reported as a lexer error.
// NOTE(review): the lex(...)/toThrow(...) pair appears twice — presumably the
// before/after of one changed assertion with the diff markers stripped, so the
// span does not parse as shown. The later pair expects a column range
// ("columns 0-9") instead of a single column.
it('should throw error on invalid unicode', function() {
expect(function() {
lex("'\\u1''bla'");
}).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1''b] at column 2 in expression ['\\u1''bla']."));
lex("'\\u1xbla'");
}).toThrow(new Error("Lexer Error: Invalid unicode escape [\\u1xbl] at columns 0-9 ['\\u1xbla'] in expression ['\\u1xbla']."));
});
});