module djinn.parsing;

@safe:

import std.algorithm;
import std.array;
import std.exception;
import std.range;
import std.string;  // fixed: was `import std..string;`, which is a syntax error
import std.utf;

import djinn.lexing;
import djinn.types;

package:

/// Turns Djinn code into post-processed tokens (entry point to all Djinn parsing)
Token[] getTokens(Source* src) pure
{
    if (src.empty) return Token[].init;
    auto tokens_app = appender!(Token[]);
    parseTokens(src, tokens_app);
    auto tokens = tokens_app[];
    return trimWhitespace(tokens, src.line_offsets);
}

private:

/**
Post-processes an array of tokens to implement whitespace-trimming rules

These rules allow using newlines, etc., to format Djinn code, without affecting the output
*/
Token[] trimWhitespace(Token[] tokens, const(size_t[]) line_offsets) pure
{
    // Djinn's whitespace rule: we don't render lines that contain at least some [< or {% tokens
    // and nothing else but whitespace.
    // This loop scans each line and checks the tokens intersecting them in total time
    // O(num_lines + num_tokens), trimming whitespace from text tokens if the rule is triggered.
    // NOTE: the `line_start_token_idx+1` / `line_end_token_idx` probes below rely on parseTokens
    // always appending a final "EOF" directive token, so indexing one past the last real token is safe.
    size_t line_start_token_idx = 0;
    foreach (j; 0..line_offsets.length-1)
    {
        const line_offset = line_offsets[j];
        const line_end_offset = line_offsets[j+1];

        // Advance to the last token that starts at or before this line's start
        while (tokens[line_start_token_idx+1].pos.offset <= line_offset)
        {
            line_start_token_idx++;
        }

        // Scan the tokens intersecting this line, classifying the line as we go
        size_t line_end_token_idx = line_start_token_idx+1;
        bool has_exprs = tokens[line_start_token_idx].type == TokenType.expressions,
             is_all_text = tokens[line_start_token_idx].type == TokenType.text;
        while (tokens[line_end_token_idx].pos.offset < line_end_offset)
        {
            has_exprs |= tokens[line_end_token_idx].type == TokenType.expressions;
            is_all_text &= tokens[line_end_token_idx].type == TokenType.text;
            line_end_token_idx++;
        }

        // Lines with [= expressions (which render output) or with nothing but literal text
        // are left alone — the trimming rule only applies to "machinery-only" lines
        if (has_exprs || is_all_text)
        {
            line_start_token_idx = line_end_token_idx - 1;
            continue;
        }

        // True iff the part of this token lying on the current line renders as pure whitespace
        bool rendersWhiteOnLine(ref const(Token) token)
        {
            assert (token.type != TokenType.expressions);
            if (token.type != TokenType.text) return true;
            import std.uni : isWhite;
            string value = token.value;
            // Clip the token's text to just the portion on this line
            if (token.pos.offset < line_offset) value = value[line_offset-token.pos.offset..$];
            if (token.pos.offset + token.value.length > line_end_offset) value = value[0..$-(token.pos.offset+token.value.length-line_end_offset)];
            return value.all!(isWhite);
        }
        if (!tokens[line_start_token_idx..line_end_token_idx].all!rendersWhiteOnLine) continue;

        // Rule triggered: erase the whitespace-only text on this line.
        // The first and last text tokens may spill onto neighboring lines, so only the
        // on-this-line portion is cut; tokens fully inside the line are blanked outright.
        if (tokens[line_start_token_idx].type == TokenType.text)
        {
            const t = tokens[line_start_token_idx];
            tokens[line_start_token_idx].value = t.value[0..line_offset-t.pos.offset];
        }
        foreach (k; line_start_token_idx+1..line_end_token_idx-1) if (tokens[k].type == TokenType.text) tokens[k].value = "";
        if (tokens[line_end_token_idx-1].type == TokenType.text)
        {
            const t = tokens[line_end_token_idx-1];
            tokens[line_end_token_idx-1].value = t.value[line_end_offset-t.pos.offset..$];
        }

        line_start_token_idx = line_end_token_idx - 1;
    }

    // This handles the explicit whitespace-stripping markers — a '|' just inside a tag,
    // e.g. [:| code |:] or [<| raw |>] — analogous to Jinja2's {%- ... -%}.
    // makeCodeToken records them in Token.whitespace_stripping; here they strip the
    // adjacent tokens' whitespace.
    foreach (j; 0..tokens.length-1)
    {
        // FIXME: correct pos
        if (tokens[j].whitespace_stripping & WhitespaceStripping.right) tokens[j+1].value = stripLeft(tokens[j+1].value);
        if (tokens[j+1].whitespace_stripping & WhitespaceStripping.left) tokens[j].value = stripRight(tokens[j].value);
    }
    return tokens;
}

/// Parses complete Djinn code, outputting tokens
void parseTokens(Source* src, ref TokenSink output) pure
{
    while (!src.empty)
    {
        foreach (j; 0..src.rem.length-1)
        {
            if (src.rem[j] == '[')
            {
                // among returns the 1-based index of the matched character (0 for no match),
                // which is cast straight onto TokenType: ':' -> statements, '=' -> expressions,
                // '<' -> directive, and no-match (0) -> text.
                // NOTE(review): assumes TokenType in djinn.types declares exactly this ordering
                // with text == 0 — confirm if the enum ever changes.
                auto type = cast(TokenType)src.rem[j+1].among(':', '=', '<');
                if (type == TokenType.text) continue;

                // Emit any literal text preceding the tag opener
                if (j > 0) output.put(Token(TokenType.text, src.rem[0..j].source, src.curPos()));
                src.rem = src.rem[j..$];
                const pos = src.curPos();
                src.rem = src.rem[2..$];  // skip the 2-char opener ("[:", "[=" or "[<")

                with (TokenType) final switch (type)
                {
                    case text:
                        assert (false);

                    case statements:
                        output.put(parseD(src, type, pos));
                        break;

                    case expressions:
                        output.put(parseD(src, type, pos));
                        break;

                    case directive:
                        parseDirective(src, pos, output);
                        break;
                }

                goto next;
            }
        }
        // No tag opener in the remaining input: the rest is one literal text token
        output.put(Token(TokenType.text, src.rem.source, src.curPos()));
        src.rem = src.rem[$..$];
        next:
    }
    // Sentinel token; trimWhitespace relies on it to probe one token past the end safely
    output.put(Token(TokenType.directive, "EOF", src.curPos()));
}

/// Parses something like "dCode(42); :]" when terminator == ":]"
Token parseD(Source* src, TokenType type, ref const(Pos) pos) pure
{
    assert (type.among(TokenType.statements, TokenType.expressions));
    /*
    Expressions must be fully balanced. We need to check this to disambiguate the ] terminator.
    Statements don't need to be balanced. E.g. (contrived example):
    lorem ipsum
    [: processThunks([{ :]
    Here's a lambda in an array passed to a function call.
    [: }]); :]
    */
    Nestables balance_what = kAllNestables;
    string terminator = "]";
    if (type == TokenType.statements)
    {
        balance_what = Nestables.init;
        terminator = ":]";
    }
    try
    {
        auto rem = dSkipFind(src.rem, terminator.byCodeUnit, balance_what, src.offset);
        const value = src.rem[0..src.rem.length-rem.length];
        src.rem = rem[terminator.length..$];
        return makeCodeToken(type, value, pos);
    }
    catch (LexicalException e)
    {
        // Re-report lexing errors as syntax errors at the precise source position
        const e_pos = src.posAt(e.offset);
        throw syntaxException(e.msg, e_pos);
    }
}

pure
unittest
{
    bool test(string code, TokenType type, Token exp) pure
    {
        auto src = new Source("foo.dj", code);
        const pos = src.curPos();
        auto result = parseD(src, type, pos);
        exp.pos = result.pos; // Ignore pos for this test
        return result == exp;
    }

    with (TokenType)
    {
        assert (test("dCode(42); :]", statements, Token(statements, "dCode(42); ")));
        assert (test("writeln(`:]`); :]", statements, Token(statements, "writeln(`:]`); ")));
        assert (test("processThunks([{ :]", statements, Token(statements, "processThunks([{ ")));
        assert (test(":]", statements, Token(statements, "")));
        assertThrown(test("]]", statements, Token.init));
        assertThrown(test("", statements, Token.init));

        assert (test("dCode(42) ]", expressions, Token(expressions, "dCode(42) ")));
        assert (test("1, 2, 3]", expressions, Token(expressions, "1, 2, 3")));
        assert (test("[]]", expressions, Token(expressions, "[]")));
        assert (test("`]` ]", expressions, Token(expressions, "`]` ")));
        assert (test(`q"[[]]"]`, expressions, Token(expressions, `q"[[]]"`)));
        assertThrown(test("processThunks([{ ]", expressions, Token.init));
        assert (test("]", expressions, Token(expressions, "")));
    }
}

/// Parses something like "foo bar >]"
void parseDirective(Source* src, ref const(Pos) pos, ref TokenSink output) pure
{
    import std.conv : text;
    auto pieces = src.rem.findSplit(">]".byCodeUnit);
    if (!pieces) throw syntaxException("no >] terminator found", pos);
    src.rem = pieces[2];
    auto directive = makeCodeToken(TokenType.directive, pieces[0], pos);
    output.put(directive);
    auto args = directive.value.splitter.filter!(e => !e.empty);
    // Only the "raw" directive needs special parsing here; everything else is emitted as-is
    if (args.empty || args.front != "raw")
    {
        return;
    }
    args.popFront();

    // Optional tag argument, e.g. [< raw tag >] ... [< endraw tag >]
    string8b end_tag;
    if (!args.empty)
    {
        static import std.ascii;
        end_tag = args.front.byCodeUnit;
        enforce(end_tag.all!(c => std.ascii.isAlphaNum(c) || c == '_'), syntaxException(text("raw directive ending tag must be identifier using ASCII letters, numbers and underscores (got ", end_tag, ")"), src));
        args.popFront();
        enforce(args.empty, syntaxException("extra arguments for raw directive (maximum 1 supported)", src));
    }

    // Search for [< endraw TAG >]
    foreach (j; 0..src.rem.length)
    {
        auto match = endRawMatch(src.rem[j..$], end_tag);
        if (match.empty) continue;
        const raw_text_pos = src.curPos();
        // Everything up to the endraw directive becomes one literal text token
        output.put(makeCodeToken(TokenType.text, src.rem[0..j], raw_text_pos));
        src.rem = src.rem[j..$];
        const end_tag_pos = src.curPos();
        // match[2..$-2] drops the "[<" and ">]" delimiters around the endraw directive
        output.put(makeCodeToken(TokenType.directive, match[2..$-2], end_tag_pos));
        src.rem = src.rem[match.length..$];
        return;
    }

    throw syntaxException(text("no [< endraw", end_tag.empty ? "" : " ", end_tag, " >] terminator found"), src);
}

unittest
{
    bool test(string code, Token[] exp) @trusted
    {
        TokenSink app;
        auto src = new Source("foo.dj", code);
        const pos = src.curPos();
        parseDirective(src, pos, app);
        if (app[].length != exp.length) return false;
        foreach (r, e; lockstep(app[], exp))
        {
            r.pos = e.pos; // Ignore pos for this test
            if (r != e) return false;
        }
        return true;
    }

    assertThrown(test("", []));

    assert (test("foo bar >]", [Token(TokenType.directive, "foo bar ")]));
    assert (test(">]", [Token(TokenType.directive, "")]));
    assert (test("foo|>] lorem ipsum", [Token(TokenType.directive, "foo", Pos.init, WhitespaceStripping.right)]));
    assert (test("| foo >]", [Token(TokenType.directive, " foo ", Pos.init, WhitespaceStripping.left)]));
    assert (test("| foo |>]", [Token(TokenType.directive, " foo ", Pos.init, WhitespaceStripping.both)]));

    assert (test("raw>][<endraw>]lorem ipsum", [
        Token(TokenType.directive, "raw"),
        Token(TokenType.text, ""),
        Token(TokenType.directive, "endraw"),
    ]));

    assert (test("| raw >]lorem ipsum[< endraw |>]", [
        Token(TokenType.directive, " raw ", Pos.init, WhitespaceStripping.left),
        Token(TokenType.text, "lorem ipsum"),
        Token(TokenType.directive, " endraw ", Pos.init, WhitespaceStripping.right),
    ]));

    assert (test("raw|>]lorem [< ipsum[<|endraw>]", [
        Token(TokenType.directive, "raw", Pos.init, WhitespaceStripping.right),
        Token(TokenType.text, "lorem [< ipsum"),
        Token(TokenType.directive, "endraw", Pos.init, WhitespaceStripping.left),
    ]));

    assert (test("raw tag>][< endraw >][<endraw tag>]", [
        Token(TokenType.directive, "raw tag"),
        Token(TokenType.text, "[< endraw >]"),
        Token(TokenType.directive, "endraw tag"),
    ]));

    assert (test("raw tag >][< endraw >][<endraw tag>]", [
        Token(TokenType.directive, "raw tag "),
        Token(TokenType.text, "[< endraw >]"),
        Token(TokenType.directive, "endraw tag"),
    ]));

    assertThrown(test("raw>]", []));
    assertThrown(test("raw tag>][<endraw>]", []));
}

/**
Matches an endraw directive (with possible tag) at start of str

Returns: matching substring, or empty if not matched
*/
string8b endRawMatch(string8b str, string8b tag) pure
{
    // Equivalent to regex `^[<|?\s*endraw\s*TAG\s*|?>]`
    enum miss = string8b.init;
    auto s = str;
    if (!s.startsWith("[<")) return miss;
    s = s[2..$];
    if (s.empty) return miss;
    if (s.front == '|') s.popFront();  // optional whitespace-stripping marker
    s = s.source.stripLeft.byCodeUnit;
    if (!s.startsWith("endraw")) return miss;
    s = s["endraw".length..$];
    s = s.source.stripLeft.byCodeUnit;
    if (!s.startsWith(tag)) return miss;
    s = s[tag.length..$];
    s = s.source.stripLeft.byCodeUnit;
    if (s.empty) return miss;
    if (s.front == '|') s.popFront();  // optional whitespace-stripping marker
    if (!s.startsWith(">]")) return miss;
    s = s[2..$];
    return str[0..str.length-s.length];
}

pure
unittest
{
    bool test(string str, string tag, string result)
    {
        return endRawMatch(str.byCodeUnit, tag.byCodeUnit) == result.byCodeUnit;
    }

    assert (test("", "", ""));
    assert (test("nope", "", ""));
    assert (test("", "nope", ""));
    assert (test("nope", "nope", ""));

    assert (test("[<endraw>]", "", "[<endraw>]"));
    assert (test("[< endraw >]", "", "[< endraw >]"));
    assert (test("[< endraw >]", "", "[< endraw >]"));
    assert (test("[< endraw >] lorem ipsum", "", "[< endraw >]"));
    assert (test("[<| endraw |>] lorem ipsum", "", "[<| endraw |>]"));
    assert (test("[< endraw >] lorem ipsum", "tag", ""));
    assert (test("[< endraw tag >] lorem ipsum", "tag", "[< endraw tag >]"));
    assert (test("[<endraw tag>] lorem ipsum", "tag", "[<endraw tag>]"));
}

/**
Builds a token from tag contents, extracting any '|' whitespace-stripping markers

A leading '|' requests stripping to the left of the tag, a trailing '|' to the right;
the markers themselves are removed from the token's value. The actual stripping is
applied later by trimWhitespace.
*/
Token makeCodeToken(TokenType type, string8b code, ref const(Pos) pos) pure
{
    WhitespaceStripping ws_strip;
    if (code.startsWith("|"))
    {
        ws_strip |= WhitespaceStripping.left;
        code = code[1..$];
    }
    if (code.endsWith("|"))
    {
        ws_strip |= WhitespaceStripping.right;
        code = code[0..$-1];
    }
    return Token(type, code.source, pos, ws_strip);
}