Implement proper comment tokenizing
CSS-style comments (`/* */`) allow interpolants (`#{$foo}`) and
disallow everything else. To dodge a bunch of edge cases I've
implemented a sub-tokenizer for comments that emits every
non-interpolant, non-whitespace token as a word token.
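
For example, a comment containing an interpolant now tokenizes roughly like this (a sketch: the comment-level tokens match the tests below, but the inner `$foo` tokens are elided and the entry point is assumed to be the package index):

    var scss = require('..');

    scss.tokenize('/* note #{$foo} */');
    // => [ ['startComment', '/*', 1, 2],
    //      ['space', ' '],
    //      ['word', 'note', 1, 4, 1, 7],
    //      ['space', ' '],
    //      ['startInterpolant', '#{', 1, 10],
    //      ... tokens for `$foo` ...
    //      ['endInterpolant', '}', 1, 15],
    //      ['space', ' '],
    //      ['endComment', '*/', 1, 18] ]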
xzyfer committed Jan 28, 2017
1 parent a87c091 commit 7969c0d
Showing 7 changed files with 246 additions and 10 deletions.
135 changes: 135 additions & 0 deletions lib/tokenize-comment.js
@@ -0,0 +1,135 @@
import Input from './input';
import tokenizeString from './tokenize-string';
import tokenizeInterpolant from './tokenize-interpolant';

let newline = '\n'.charCodeAt(0),
    space = ' '.charCodeAt(0),
    feed = '\f'.charCodeAt(0),
    tab = '\t'.charCodeAt(0),
    cr = '\r'.charCodeAt(0),
    hash = '#'.charCodeAt(0),
    backslash = '\\'.charCodeAt(0),
    slash = '/'.charCodeAt(0),
    openCurly = '{'.charCodeAt(0),
    closeCurly = '}'.charCodeAt(0),
    asterisk = '*'.charCodeAt(0),
    wordEnd = /[ \n\t\r\(\)\{\},:;@!'"\\]|\/(?=\*)|#(?={)/g;

export default function tokenize(input, l, p) {
    let tokens = [];
    let css = input.css.valueOf();

    let code, next, lines, last, content, escape,
        nextLine, nextOffset, escaped, escapePos,
        inInterpolant, inComment, inString;

    let length = css.length;
    let offset = -1;
    let line = l || 1;
    let pos = p || 0;

    loop:
    while ( pos < length ) {
        code = css.charCodeAt(pos);

        if ( code === newline ) {
            offset = pos;
            line += 1;
        }

        switch ( code ) {
            case space:
            case tab:
            case cr:
            case feed:
                next = pos;
                do {
                    next += 1;
                    code = css.charCodeAt(next);
                    if ( code === newline ) {
                        offset = next;
                        line += 1;
                    }
                } while ( code === space ||
                          code === tab ||
                          code === cr ||
                          code === feed );

                tokens.push(['space', css.slice(pos, next)]);
                pos = next - 1;
                break;

            case newline:
                tokens.push(['newline', '\n', line, pos - offset]);
                break;

            case closeCurly:
                tokens.push(['endInterpolant', '}', line, pos - offset]);
                break;

            case backslash:
                next = pos;
                escape = true;
                while ( css.charCodeAt(next + 1) === backslash ) {
                    next += 1;
                    escape = !escape;
                }
                code = css.charCodeAt(next + 1);
                if ( escape && (code !== slash &&
                                code !== space &&
                                code !== newline &&
                                code !== tab &&
                                code !== cr &&
                                code !== feed ) ) {
                    next += 1;
                }
                tokens.push(['word', css.slice(pos, next + 1),
                    line, pos - offset,
                    line, next - offset
                ]);
                pos = next;
                break;

            default:

                if ( code === asterisk && css.charCodeAt(pos + 1) === slash ) {
                    next = pos;
                    pos = next - 1;
                    break loop;
                }

                if ( code === hash && css.charCodeAt(pos + 1) === openCurly ) {
                    tokens.push(['startInterpolant', '#{', line, pos + 1 - offset]);
                    next = pos + 1;

                    let { tokens: t, pos: p } = tokenizeInterpolant(input, line, next + 1);
                    tokens = tokens.concat(t);
                    next = p;

                    pos = next;
                    break;
                }

                wordEnd.lastIndex = pos + 1;
                wordEnd.test(css);
                if ( wordEnd.lastIndex === 0 ) {
                    next = css.length - 1;
                } else {
                    next = wordEnd.lastIndex - 2;
                }

                tokens.push(['word', css.slice(pos, next + 1),
                    line, pos - offset,
                    line, next - offset
                ]);

                pos = next;

                break;
        }

        pos++;
    }

    return { tokens, line, pos, offset };
}
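
Note that the sub-tokenizer never consumes the closing `*/` itself: on seeing `*` followed by `/` it backs `pos` up one position and breaks out of the loop, so the caller can emit `endComment`. A minimal sketch of calling it directly (assuming `Input` takes a CSS string, as with the other tokenizers):

    import Input from './input';
    import tokenizeComment from './tokenize-comment';

    let input = new Input('/* hi */');
    // Start just past the opening `/*` (pos 2); the sub-tokenizer stops
    // at the `*` of the closing `*/` and hands control back.
    let { tokens, pos } = tokenizeComment(input, 1, 2);
    // tokens => [ ['space', ' '], ['word', 'hi', 1, 4, 1, 5], ['space', ' '] ]
    // pos    => 5, the last index consumed before the closing `*/`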
20 changes: 15 additions & 5 deletions lib/tokenize-interpolant.js
@@ -1,5 +1,6 @@
import Input from './input';
import tokenizeString from './tokenize-string';
import tokenizeComment from './tokenize-comment';
import tokenizeInterpolant from './tokenize-interpolant';

let singleQuote = "'".charCodeAt(0),
@@ -88,10 +89,6 @@ export default function tokenize(input, l, p) {
                tokens.push(['-', '-', line, pos - offset]);
                break;

            case asterisk:
                tokens.push(['*', '*', line, pos - offset]);
                break;

            case decComb:
                tokens.push(['>', '>', line, pos - offset]);
                break;
@@ -184,7 +181,20 @@ export default function tokenize(input, l, p) {
                if ( code === slash && css.charCodeAt(pos + 1) === asterisk ) {
                    inComment = true;
                    tokens.push(['startComment', '/*', line, pos + 1 - offset]);
                    pos += 2;
                    next = pos + 1;

                    let { tokens: t, line: l, pos: p, offset: o } = tokenizeComment(input, line, next + 1);
                    tokens = tokens.concat(t);
                    next = p;
                    line = l;
                    offset = o;

                    pos = next;
                    break;
                }

                if ( code === asterisk && css.charCodeAt(pos + 1) !== slash ) {
                    tokens.push(['*', '*', line, pos - offset]);
                    break;
                }

20 changes: 15 additions & 5 deletions lib/tokenize.js
@@ -1,5 +1,6 @@
import Input from './input';
import tokenizeString from './tokenize-string';
import tokenizeComment from './tokenize-comment';
import tokenizeInterpolant from './tokenize-interpolant';

let singleQuote = "'".charCodeAt(0),
@@ -87,10 +88,6 @@ export default function tokenize(input, l, p) {
                tokens.push(['-', '-', line, pos - offset]);
                break;

            case asterisk:
                tokens.push(['*', '*', line, pos - offset]);
                break;

            case decComb:
                tokens.push(['>', '>', line, pos - offset]);
                break;
@@ -183,7 +180,20 @@ export default function tokenize(input, l, p) {
                if ( code === slash && css.charCodeAt(pos + 1) === asterisk ) {
                    inComment = true;
                    tokens.push(['startComment', '/*', line, pos + 1 - offset]);
                    pos += 2;
                    next = pos + 1;

                    let { tokens: t, line: l, pos: p, offset: o } = tokenizeComment(input, line, next + 1);
                    tokens = tokens.concat(t);
                    next = p;
                    line = l;
                    offset = o;

                    pos = next;
                    break;
                }

                if ( code === asterisk && css.charCodeAt(pos + 1) !== slash ) {
                    tokens.push(['*', '*', line, pos - offset]);
                    break;
                }

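Both call sites thread the sub-tokenizer's state back into the parent the same way. Since a comment can span multiple lines, `line` and `offset` have to be carried back as well, not just `pos`; an annotated restatement of the hand-off above:

    let { tokens: t, line: l, pos: p, offset: o } = tokenizeComment(input, line, next + 1);
    tokens = tokens.concat(t); // splice the comment's inner tokens in
    line = l;                  // comments may span lines...
    offset = o;                // ...so line/offset must carry back
    pos = p;                   // resume just before the closing `*/`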
72 changes: 72 additions & 0 deletions test/comment.js
@@ -0,0 +1,72 @@
var scss = require('..');
var fs = require('fs');
var path = require('path');
var assert = require('chai').assert;

var fixture = function(name) {
    return fs.readFileSync(
        path.join(__dirname, 'fixture', name)
    );
};

describe('Comment', function() {
    it('should tokenize a simple comment', function() {
        assert.deepEqual(
            [
                ['startComment', '/*', 1, 2],
                ['space', ' '],
                ['word', 'my', 1, 4, 1, 5],
                ['space', ' '],
                ['word', 'comment', 1, 7, 1, 13],
                ['space', ' '],
                ['endComment', '*/', 1, 16],
            ],
            scss.tokenize(fixture('simple-comment.scss'))
        );
    });

    it('should tokenize a multiline comment', function() {
        assert.deepEqual(
            [
                ['startComment', '/*', 1, 2],
                ['newline', '\n', 2, 0],
                ['word', 'my', 2, 1, 2, 2],
                ['space', ' '],
                ['word', 'comment', 2, 4, 2, 10],
                ['newline', '\n', 3, 0],
                ['endComment', '*/', 3, 2],
            ],
            scss.tokenize(fixture('multiline-comment.scss'))
        );
    });

    it('should tokenize a docblock comment', function() {
        assert.deepEqual(
            [
                ['startComment', '/*', 1, 2],
                ['word', '*', 1, 3, 1, 3],
                ['newline', '\n', 2, 0],
                ['space', ' '],
                ['word', '*', 2, 2, 2, 2],
                ['space', ' '],
                ['word', 'line', 2, 4, 2, 7],
                ['space', ' '],
                ['word', '1', 2, 9, 2, 9],
                ['newline', '\n', 3, 0],
                ['space', ' '],
                ['word', '*', 3, 2, 3, 2],
                ['newline', '\n', 4, 0],
                ['space', ' '],
                ['word', '*', 4, 2, 4, 2],
                ['space', ' '],
                ['word', 'line', 4, 4, 4, 7],
                ['space', ' '],
                ['word', '2', 4, 9, 4, 9],
                ['newline', '\n', 5, 0],
                ['space', ' '],
                ['endComment', '*/', 5, 3],
            ],
            scss.tokenize(fixture('docblock-comment.scss'))
        );
    });
});
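
The docblock case is the interesting one: lone asterisks inside a comment come out as plain word tokens, not `*` operator tokens. Tracing the tokenizer by hand on a tiny input (same `require('..')` entry point as the tests):

    var scss = require('..');

    scss.tokenize('/* * */');
    // => [ ['startComment', '/*', 1, 2],
    //      ['space', ' '],
    //      ['word', '*', 1, 4, 1, 4],
    //      ['space', ' '],
    //      ['endComment', '*/', 1, 7] ]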
5 changes: 5 additions & 0 deletions test/fixture/docblock-comment.scss
@@ -0,0 +1,5 @@
/**
 * line 1
 *
 * line 2
 */
3 changes: 3 additions & 0 deletions test/fixture/multiline-comment.scss
@@ -0,0 +1,3 @@
/*
my comment
*/
1 change: 1 addition & 0 deletions test/fixture/simple-comment.scss
@@ -0,0 +1 @@
/* my comment */
