// esbuild/internal/css_lexer/css_lexer.go
package css_lexer
import (
"strings"
"unicode/utf8"
"github.com/evanw/esbuild/internal/logger"
)
// The lexer converts a source file to a stream of tokens. Unlike esbuild's
// JavaScript lexer, this CSS lexer runs to completion before the CSS parser
// begins, resulting in a single array of all tokens in the file.
type T uint8
const eof = -1 // Code point value used to indicate the end of the file
const replacementCharacter = 0xFFFD // U+FFFD, substituted when an escape decodes to an invalid code point
const (
TEndOfFile T = iota
TAtKeyword
TBadString
TBadURL
TCDC // "-->"
TCDO // "<!--"
TCloseBrace
TCloseBracket
TCloseParen
TColon
TComma
TDelim
TDelimAmpersand
TDelimAsterisk
TDelimBar
TDelimCaret
TDelimDollar
TDelimDot
TDelimEquals
TDelimExclamation
TDelimGreaterThan
TDelimPlus
TDelimSlash
TDelimTilde
TDimension
TFunction
THash
TIdent
TNumber
TOpenBrace
TOpenBracket
TOpenParen
TPercentage
TSemicolon
TString
TURL
TWhitespace
)
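// The entries in this table must be kept in the same order as the token
// constants above, since a token kind is used directly as an index into it.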
var tokenToString = []string{
"end of file",
"@-keyword",
"bad string token",
"bad URL token",
"\"-->\"",
"\"<!--\"",
"\"}\"",
"\"]\"",
"\")\"",
"\":\"",
"\",\"",
"delimiter",
"\"&\"",
"\"*\"",
"\"|\"",
"\"^\"",
"\"$\"",
"\".\"",
"\"=\"",
"\"!\"",
"\">\"",
"\"+\"",
"\"/\"",
"\"~\"",
"dimension",
"function token",
"hash token",
"identifier",
"number",
"\"{\"",
"\"[\"",
"\"(\"",
"percentage",
"\";\"",
"string token",
"URL token",
"whitespace",
}
func (t T) String() string {
return tokenToString[t]
}
// This token struct is designed to be memory-efficient. It just references a
// range in the input file instead of directly containing the substring of text
// since a range takes up less memory than a string.
type Token struct {
Range logger.Range // 8 bytes
UnitOffset uint16 // 2 bytes
Kind T // 1 byte
IsID bool // 1 byte
}
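// DecodedText returns the token's text with surrounding syntax stripped and
// backslash escapes resolved: quotes are removed from strings, the leading
// "#" or "@" is removed from hash and at-keyword tokens, and "url(" / ")"
// are removed from URL tokens. For example, a THash token for "#fff"
// decodes to "fff".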
func (token Token) DecodedText(contents string) string {
raw := contents[token.Range.Loc.Start:token.Range.End()]
switch token.Kind {
case TIdent, TDimension:
return decodeEscapesInToken(raw)
case TAtKeyword, THash:
return decodeEscapesInToken(raw[1:])
case TFunction:
return decodeEscapesInToken(raw[:len(raw)-1])
case TString:
return decodeEscapesInToken(raw[1 : len(raw)-1])
case TURL:
start := 4 // Skip the "url(" prefix
end := len(raw) - 1 // Strip the trailing ")"
// Trim leading and trailing whitespace
for start < end && isWhitespace(rune(raw[start])) {
start++
}
for start < end && isWhitespace(rune(raw[end-1])) {
end--
}
return decodeEscapesInToken(raw[start:end])
}
return raw
}
type lexer struct {
log logger.Log
source logger.Source
current int
codePoint rune
Token Token
}
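// Tokenize lexes an entire source file into a flat slice of tokens. A minimal
// usage sketch (hypothetical: it assumes logger.NewDeferLog() is available and
// builds a logger.Source by hand, which may not match how callers actually
// construct these values):
//
//	log := logger.NewDeferLog()
//	source := logger.Source{Contents: "a { color: #fff }"}
//	for _, t := range Tokenize(log, source) {
//		fmt.Println(t.Kind, t.DecodedText(source.Contents))
//	}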
func Tokenize(log logger.Log, source logger.Source) (tokens []Token) {
lexer := lexer{
log: log,
source: source,
}
lexer.step()
// The U+FEFF character is usually a zero-width non-breaking space. However,
// when it's used at the start of a text stream it is called a BOM (byte order
// mark) instead and indicates that the text stream is UTF-8 encoded. This is
// problematic for us because CSS does not treat U+FEFF as whitespace. Only
// " \t\r\n\f" characters are treated as whitespace. Skip over the BOM if it
// is present so it doesn't cause us trouble when we try to parse it.
if lexer.codePoint == '\uFEFF' {
lexer.step()
}
lexer.next()
for lexer.Token.Kind != TEndOfFile {
tokens = append(tokens, lexer.Token)
lexer.next()
}
return
}
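// step reads the next code point into lexer.codePoint and extends the current
// token's range to end just before it.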
func (lexer *lexer) step() {
codePoint, width := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])
// Use -1 to indicate the end of the file
if width == 0 {
codePoint = eof
}
lexer.codePoint = codePoint
lexer.Token.Range.Len = int32(lexer.current) - lexer.Token.Range.Loc.Start
lexer.current += width
}
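// next scans a single token starting at the current code point, looping past
// any comments along the way.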
func (lexer *lexer) next() {
// Reference: https://www.w3.org/TR/css-syntax-3/
for {
lexer.Token = Token{Range: logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}}}
switch lexer.codePoint {
case eof:
lexer.Token.Kind = TEndOfFile
case '/':
lexer.step()
switch lexer.codePoint {
case '*':
lexer.step()
lexer.consumeToEndOfMultiLineComment(lexer.Token.Range)
continue
case '/':
lexer.step()
lexer.consumeToEndOfSingleLineComment()
continue
}
lexer.Token.Kind = TDelimSlash
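// Coalesce runs of whitespace, along with any /* ... */ comments mixed
// into them, into a single whitespace token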
case ' ', '\t', '\n', '\r', '\f':
lexer.step()
for {
if isWhitespace(lexer.codePoint) {
lexer.step()
} else if lexer.codePoint == '/' && lexer.current < len(lexer.source.Contents) && lexer.source.Contents[lexer.current] == '*' {
startRange := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 2}
lexer.step()
lexer.step()
lexer.consumeToEndOfMultiLineComment(startRange)
} else {
break
}
}
lexer.Token.Kind = TWhitespace
case '"', '\'':
lexer.Token.Kind = lexer.consumeString()
case '#':
lexer.step()
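// "#" starts a hash token if a name character or a valid escape follows.
// IsID is the "id" type flag from the CSS syntax spec: it's set when the
// name is also a valid identifier (e.g. "#main" but not "#0af")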
if IsNameContinue(lexer.codePoint) || lexer.isValidEscape() {
lexer.Token.Kind = THash
if lexer.wouldStartIdentifier() {
lexer.Token.IsID = true
}
lexer.consumeName()
} else {
lexer.Token.Kind = TDelim
}
case '(':
lexer.step()
lexer.Token.Kind = TOpenParen
case ')':
lexer.step()
lexer.Token.Kind = TCloseParen
case '[':
lexer.step()
lexer.Token.Kind = TOpenBracket
case ']':
lexer.step()
lexer.Token.Kind = TCloseBracket
case '{':
lexer.step()
lexer.Token.Kind = TOpenBrace
case '}':
lexer.step()
lexer.Token.Kind = TCloseBrace
case ',':
lexer.step()
lexer.Token.Kind = TComma
case ':':
lexer.step()
lexer.Token.Kind = TColon
case ';':
lexer.step()
lexer.Token.Kind = TSemicolon
case '+':
if lexer.wouldStartNumber() {
lexer.Token.Kind = lexer.consumeNumeric()
} else {
lexer.step()
lexer.Token.Kind = TDelimPlus
}
case '.':
if lexer.wouldStartNumber() {
lexer.Token.Kind = lexer.consumeNumeric()
} else {
lexer.step()
lexer.Token.Kind = TDelimDot
}
case '-':
if lexer.wouldStartNumber() {
lexer.Token.Kind = lexer.consumeNumeric()
} else if lexer.current+2 <= len(lexer.source.Contents) && lexer.source.Contents[lexer.current:lexer.current+2] == "->" {
lexer.step()
lexer.step()
lexer.step()
lexer.Token.Kind = TCDC
} else if lexer.wouldStartIdentifier() {
lexer.consumeName()
lexer.Token.Kind = TIdent
} else {
lexer.step()
lexer.Token.Kind = TDelim
}
case '<':
if lexer.current+3 <= len(lexer.source.Contents) && lexer.source.Contents[lexer.current:lexer.current+3] == "!--" {
lexer.step()
lexer.step()
lexer.step()
lexer.step()
lexer.Token.Kind = TCDO
} else {
lexer.step()
lexer.Token.Kind = TDelim
}
case '@':
lexer.step()
if lexer.wouldStartIdentifier() {
lexer.consumeName()
lexer.Token.Kind = TAtKeyword
} else {
lexer.Token.Kind = TDelim
}
case '\\':
if lexer.isValidEscape() {
lexer.Token.Kind = lexer.consumeIdentLike()
} else {
lexer.step()
lexer.log.AddRangeError(&lexer.source, lexer.Token.Range, "Invalid escape")
lexer.Token.Kind = TDelim
}
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
lexer.Token.Kind = lexer.consumeNumeric()
case '>':
lexer.step()
lexer.Token.Kind = TDelimGreaterThan
case '~':
lexer.step()
lexer.Token.Kind = TDelimTilde
case '&':
lexer.step()
lexer.Token.Kind = TDelimAmpersand
case '*':
lexer.step()
lexer.Token.Kind = TDelimAsterisk
case '|':
lexer.step()
lexer.Token.Kind = TDelimBar
case '!':
lexer.step()
lexer.Token.Kind = TDelimExclamation
case '=':
lexer.step()
lexer.Token.Kind = TDelimEquals
case '^':
lexer.step()
lexer.Token.Kind = TDelimCaret
case '$':
lexer.step()
lexer.Token.Kind = TDelimDollar
default:
if IsNameStart(lexer.codePoint) {
lexer.Token.Kind = lexer.consumeIdentLike()
} else {
lexer.step()
lexer.Token.Kind = TDelim
}
}
return
}
}
func (lexer *lexer) consumeToEndOfMultiLineComment(startRange logger.Range) {
for {
switch lexer.codePoint {
case '*':
lexer.step()
if lexer.codePoint == '/' {
lexer.step()
return
}
case eof: // This indicates the end of the file
lexer.log.AddErrorWithNotes(&lexer.source, logger.Loc{Start: lexer.Token.Range.End()}, "Expected \"*/\" to terminate multi-line comment",
[]logger.MsgData{logger.RangeData(&lexer.source, startRange, "The multi-line comment starts here")})
return
default:
lexer.step()
}
}
}
func (lexer *lexer) consumeToEndOfSingleLineComment() {
for !isNewline(lexer.codePoint) && lexer.codePoint != eof {
lexer.step()
}
lexer.log.AddRangeWarning(&lexer.source, lexer.Token.Range, "Comments in CSS use \"/* ... */\" instead of \"//\"")
}
func (lexer *lexer) isValidEscape() bool {
if lexer.codePoint != '\\' {
return false
}
c, _ := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])
return !isNewline(c)
}
func (lexer *lexer) wouldStartIdentifier() bool {
if IsNameStart(lexer.codePoint) {
return true
}
if lexer.codePoint == '-' {
c, w := utf8.DecodeRuneInString(lexer.source.Contents[lexer.current:])
if IsNameStart(c) || c == '-' {
return true
}
if c == '\\' {
c, _ = utf8.DecodeRuneInString(lexer.source.Contents[lexer.current+w:])
return !isNewline(c)
}
return false
}
return lexer.isValidEscape()
}
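// WouldStartIdentifierWithoutEscapes reports whether text begins with an
// identifier made of plain name characters: "foo", "-foo", and "--x" qualify,
// while "-1" does not. Unlike the lexer's wouldStartIdentifier, backslash
// escapes are not considered.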
func WouldStartIdentifierWithoutEscapes(text string) bool {
if len(text) > 0 {
c, width := utf8.DecodeRuneInString(text)
if IsNameStart(c) {
return true
} else if c == '-' {
if c, _ := utf8.DecodeRuneInString(text[width:]); IsNameStart(c) || c == '-' {
return true
}
}
}
return false
}
func (lexer *lexer) wouldStartNumber() bool {
if lexer.codePoint >= '0' && lexer.codePoint <= '9' {
return true
} else if lexer.codePoint == '.' {
contents := lexer.source.Contents
if lexer.current < len(contents) {
c := contents[lexer.current]
return c >= '0' && c <= '9'
}
} else if lexer.codePoint == '+' || lexer.codePoint == '-' {
contents := lexer.source.Contents
n := len(contents)
if lexer.current < n {
c := contents[lexer.current]
if c >= '0' && c <= '9' {
return true
}
if c == '.' && lexer.current+1 < n {
c = contents[lexer.current+1]
return c >= '0' && c <= '9'
}
}
}
return false
}
func (lexer *lexer) consumeName() string {
// Common case: no escapes, identifier is a substring of the input
for IsNameContinue(lexer.codePoint) {
lexer.step()
}
raw := lexer.source.Contents[lexer.Token.Range.Loc.Start:lexer.Token.Range.End()]
if !lexer.isValidEscape() {
return raw
}
// Uncommon case: escapes are present, so the identifier must be allocated
sb := strings.Builder{}
sb.WriteString(raw)
sb.WriteRune(lexer.consumeEscape())
for {
if IsNameContinue(lexer.codePoint) {
sb.WriteRune(lexer.codePoint)
lexer.step()
} else if lexer.isValidEscape() {
sb.WriteRune(lexer.consumeEscape())
} else {
break
}
}
return sb.String()
}
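// consumeEscape advances past a backslash escape and returns the code point
// it denotes. A hex escape consumes up to six hex digits plus one optional
// trailing whitespace character, so "\41 B" decodes to "AB" rather than "A B".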
func (lexer *lexer) consumeEscape() rune {
lexer.step() // Skip the backslash
c := lexer.codePoint
if hex, ok := isHex(c); ok {
lexer.step()
for i := 0; i < 5; i++ {
if next, ok := isHex(lexer.codePoint); ok {
lexer.step()
hex = hex*16 + next
} else {
break
}
}
if isWhitespace(lexer.codePoint) {
lexer.step()
}
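// The null character, surrogate code points, and anything past the Unicode
// maximum are invalid in CSS and decode to U+FFFD instead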
if hex == 0 || (hex >= 0xD800 && hex <= 0xDFFF) || hex > 0x10FFFF {
return replacementCharacter
}
return rune(hex)
}
if c == eof {
return replacementCharacter
}
lexer.step()
return c
}
func (lexer *lexer) consumeIdentLike() T {
name := lexer.consumeName()
if lexer.codePoint == '(' {
lexer.step()
if len(name) == 3 {
u, r, l := name[0], name[1], name[2]
if (u == 'u' || u == 'U') && (r == 'r' || r == 'R') && (l == 'l' || l == 'L') {
for isWhitespace(lexer.codePoint) {
lexer.step()
}
if lexer.codePoint != '"' && lexer.codePoint != '\'' {
return lexer.consumeURL()
}
}
}
return TFunction
}
return TIdent
}
func (lexer *lexer) consumeURL() T {
validURL:
for {
switch lexer.codePoint {
case ')':
lexer.step()
return TURL
case eof:
loc := logger.Loc{Start: lexer.Token.Range.End()}
lexer.log.AddError(&lexer.source, loc, "Expected \")\" to end URL token")
return TBadURL
case ' ', '\t', '\n', '\r', '\f':
lexer.step()
for isWhitespace(lexer.codePoint) {
lexer.step()
}
if lexer.codePoint != ')' {
loc := logger.Loc{Start: lexer.Token.Range.End()}
lexer.log.AddError(&lexer.source, loc, "Expected \")\" to end URL token")
break validURL
}
lexer.step()
return TURL
case '"', '\'', '(':
r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
lexer.log.AddRangeError(&lexer.source, r, "Expected \")\" to end URL token")
break validURL
case '\\':
if !lexer.isValidEscape() {
r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
lexer.log.AddRangeError(&lexer.source, r, "Invalid escape")
break validURL
}
lexer.consumeEscape()
default:
if isNonPrintable(lexer.codePoint) {
r := logger.Range{Loc: logger.Loc{Start: lexer.Token.Range.End()}, Len: 1}
lexer.log.AddRangeError(&lexer.source, r, "Unexpected non-printable character in URL token")
}
lexer.step()
}
}
// Consume the remnants of a bad URL token
for {
switch lexer.codePoint {
case ')', eof:
lexer.step()
return TBadURL
case '\\':
if lexer.isValidEscape() {
lexer.consumeEscape()
}
}
lexer.step()
}
}
func (lexer *lexer) consumeString() T {
quote := lexer.codePoint
lexer.step()
for {
switch lexer.codePoint {
case '\\':
lexer.step()
// Handle Windows CRLF
if lexer.codePoint == '\r' {
lexer.step()
if lexer.codePoint == '\n' {
lexer.step()
}
continue
}
// Otherwise, fall through to the step() below, which skips the character after the backslash
case eof:
lexer.log.AddError(&lexer.source, logger.Loc{Start: lexer.Token.Range.End()}, "Unterminated string token")
return TBadString
case '\n', '\r', '\f':
lexer.log.AddError(&lexer.source, logger.Loc{Start: lexer.Token.Range.End()}, "Unterminated string token")
return TBadString
case quote:
lexer.step()
return TString
}
lexer.step()
}
}
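// consumeNumeric lexes a number and classifies it: "12px" becomes a
// TDimension (with UnitOffset marking where "px" starts), "50%" becomes a
// TPercentage, and "+3.5e-2" is a plain TNumber.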
func (lexer *lexer) consumeNumeric() T {
// Skip over leading sign
if lexer.codePoint == '+' || lexer.codePoint == '-' {
lexer.step()
}
// Skip over leading digits
for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
lexer.step()
}
// Skip over digits after dot
if lexer.codePoint == '.' {
lexer.step()
for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
lexer.step()
}
}
// Skip over exponent
if lexer.codePoint == 'e' || lexer.codePoint == 'E' {
contents := lexer.source.Contents
// Look ahead before advancing to make sure this is an exponent, not a unit
if lexer.current < len(contents) {
c := contents[lexer.current]
if (c == '+' || c == '-') && lexer.current+1 < len(contents) {
c = contents[lexer.current+1]
}
// Only consume this if it's an exponent
if c >= '0' && c <= '9' {
lexer.step()
if lexer.codePoint == '+' || lexer.codePoint == '-' {
lexer.step()
}
for lexer.codePoint >= '0' && lexer.codePoint <= '9' {
lexer.step()
}
}
}
}
// Determine the numeric type
if lexer.wouldStartIdentifier() {
lexer.Token.UnitOffset = uint16(lexer.Token.Range.Len)
lexer.consumeName()
return TDimension
}
if lexer.codePoint == '%' {
lexer.step()
return TPercentage
}
return TNumber
}
func IsNameStart(c rune) bool {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80
}
func IsNameContinue(c rune) bool {
return IsNameStart(c) || (c >= '0' && c <= '9') || c == '-'
}
func isNewline(c rune) bool {
switch c {
case '\n', '\r', '\f':
return true
}
return false
}
func isWhitespace(c rune) bool {
switch c {
case ' ', '\t', '\n', '\r', '\f':
return true
}
return false
}
func isHex(c rune) (int, bool) {
if c >= '0' && c <= '9' {
return int(c - '0'), true
}
if c >= 'a' && c <= 'f' {
return int(c + (10 - 'a')), true
}
if c >= 'A' && c <= 'F' {
return int(c + (10 - 'A')), true
}
return 0, false
}
func isNonPrintable(c rune) bool {
return c <= 0x08 || c == 0x0B || (c >= 0x0E && c <= 0x1F) || c == 0x7F
}
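// decodeEscapesInToken resolves backslash escapes in a token's inner text.
// For example, "\66oo" decodes to "foo" (0x66 is "f"), an escaped newline
// is dropped entirely, and an escape that produces the null character, a
// surrogate, or a code point above 0x10FFFF decodes to U+FFFD.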
func decodeEscapesInToken(inner string) string {
i := 0
for i < len(inner) {
if inner[i] == '\\' {
break
}
i++
}
if i == len(inner) {
return inner
}
sb := strings.Builder{}
sb.WriteString(inner[:i])
inner = inner[i:]
for len(inner) > 0 {
c, width := utf8.DecodeRuneInString(inner)
inner = inner[width:]
if c != '\\' {
sb.WriteRune(c)
continue
}
if len(inner) == 0 {
sb.WriteRune(replacementCharacter)
continue
}
c, width = utf8.DecodeRuneInString(inner)
inner = inner[width:]
hex, ok := isHex(c)
if !ok {
if c == '\n' || c == '\f' {
continue
}
// Handle Windows CRLF
if c == '\r' {
c, width = utf8.DecodeRuneInString(inner)
if c == '\n' {
inner = inner[width:]
}
continue
}
// If we get here, this is not a valid escape. However, this is still
// allowed. In this case the backslash is just ignored.
sb.WriteRune(c)
continue
}
// Parse up to five additional hex characters (so six in total)
for i := 0; i < 5 && len(inner) > 0; i++ {
c, width = utf8.DecodeRuneInString(inner)
if next, ok := isHex(c); ok {
inner = inner[width:]
hex = hex*16 + next
} else {
break
}
}
if len(inner) > 0 {
c, width = utf8.DecodeRuneInString(inner)
if isWhitespace(c) {
inner = inner[width:]
}
}
if hex == 0 || (hex >= 0xD800 && hex <= 0xDFFF) || hex > 0x10FFFF {
sb.WriteRune(replacementCharacter)
continue
}
sb.WriteRune(rune(hex))
}
return sb.String()
}