module djinn.parsing;

@safe:

import std.algorithm;
import std.array;
import std.exception;
import std.range;
import std.string;
import std.utf;

import djinn.lexing;
import djinn.types;

package:

/// Turns Djinn code into post-processed tokens (entry point to all Djinn parsing)
Token[] getTokens(Source* src) pure
{
	if (src.empty) return Token[].init;
	auto tokens_app = appender!(Token[]);
	parseTokens(src, tokens_app);
	auto tokens = tokens_app[];
	return trimWhitespace(tokens, src.line_offsets);
}
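
// A minimal sketch of the entry point in action.  Hedged: token positions are
// ignored, and the expected values assume the tokenisation behaviour exercised
// by the unittests further below.
pure unittest
{
	auto src = new Source("foo.dj", "hello [= name ] world");
	auto tokens = getTokens(src);
	// Expect: text, expression, text, then the synthetic EOF directive
	assert (tokens.length == 4);
	assert (tokens[0].type == TokenType.text && tokens[0].value == "hello ");
	assert (tokens[1].type == TokenType.expressions && tokens[1].value == " name ");
	assert (tokens[2].type == TokenType.text && tokens[2].value == " world");
	assert (tokens[3].type == TokenType.directive && tokens[3].value == "EOF");
}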

private:

/**
  Post-processes an array of tokens to implement the whitespace-trimming rules

  These rules allow newlines and indentation to be used to format Djinn code without affecting the rendered output
*/
Token[] trimWhitespace(Token[] tokens, const(size_t[]) line_offsets) pure
{
	// Djinn's whitespace rule: lines that contain at least one [: or [< tag and nothing else but whitespace are not rendered
	// This loop scans each line and the tokens intersecting it in total time O(num_lines + num_tokens), trimming whitespace from text tokens whenever the rule is triggered
	size_t line_start_token_idx = 0;
	foreach (j; 0..line_offsets.length-1)
	{
		const line_offset = line_offsets[j];
		const line_end_offset = line_offsets[j+1];

		while (tokens[line_start_token_idx+1].pos.offset <= line_offset)
		{
			line_start_token_idx++;
		}

		size_t line_end_token_idx = line_start_token_idx+1;
		bool has_exprs = tokens[line_start_token_idx].type == TokenType.expressions;
		bool is_all_text = tokens[line_start_token_idx].type == TokenType.text;
		while (tokens[line_end_token_idx].pos.offset < line_end_offset)
		{
			has_exprs |= tokens[line_end_token_idx].type == TokenType.expressions;
			is_all_text &= tokens[line_end_token_idx].type == TokenType.text;
			line_end_token_idx++;
		}

		if (has_exprs || is_all_text)
		{
			line_start_token_idx = line_end_token_idx - 1;
			continue;
		}

		bool rendersWhiteOnLine(ref const(Token) token)
		{
			assert (token.type != TokenType.expressions);
			if (token.type != TokenType.text) return true;
			import std.uni : isWhite;
			string value = token.value;
			if (token.pos.offset < line_offset) value = value[line_offset-token.pos.offset..$];
			if (token.pos.offset + token.value.length > line_end_offset) value = value[0..$-(token.pos.offset+token.value.length-line_end_offset)];
			return value.all!(isWhite);
		}
		if (!tokens[line_start_token_idx..line_end_token_idx].all!rendersWhiteOnLine) continue;

		if (tokens[line_start_token_idx].type == TokenType.text)
		{
			const t = tokens[line_start_token_idx];
			tokens[line_start_token_idx].value = t.value[0..line_offset-t.pos.offset];
		}
		foreach (k; line_start_token_idx+1..line_end_token_idx-1) if (tokens[k].type == TokenType.text) tokens[k].value = "";
		if (tokens[line_end_token_idx-1].type == TokenType.text)
		{
			const t = tokens[line_end_token_idx-1];
			tokens[line_end_token_idx-1].value = t.value[line_end_offset-t.pos.offset..$];
		}

		line_start_token_idx = line_end_token_idx - 1;
	}

	// This handles the [:| and |:] whitespace-stripping markers, like Jinja2's {%- and -%}
	foreach (j; 0..tokens.length-1)
	{
		// FIXME: correct pos
		if (tokens[j].whitespace_stripping & WhitespaceStripping.right) tokens[j+1].value = stripLeft(tokens[j+1].value);
		if (tokens[j+1].whitespace_stripping & WhitespaceStripping.left) tokens[j].value = stripRight(tokens[j].value);
	}
	return tokens;
}
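
// Two hedged sketches of trimWhitespace.  The token values and offsets below
// are hand-built assumptions for illustration, not fixtures from the real lexer.
pure unittest
{
	// The | stripping pass alone: a single-entry offsets array keeps the
	// line-scanning loop idle, so only the second loop runs
	auto tokens = [
		Token(TokenType.directive, "foo", Pos.init, WhitespaceStripping.right),
		Token(TokenType.text, "  bar  "),
	];
	const size_t[] offsets = [0];
	auto result = trimWhitespace(tokens, offsets);
	assert (result[1].value == "bar  ");  // stripped on the left only
}

// The whitespace rule end-to-end (assuming line_offsets marks each line start
// plus a final sentinel): a line holding only a statement tag renders nothing
pure unittest
{
	auto src = new Source("foo.dj", "a\n[: x; :]\nb");
	auto tokens = getTokens(src);
	assert (tokens[0].value == "a\n");
	assert (tokens[2].value == "b");  // the newline after :] was trimmed too
}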

/// Parses complete Djinn code, outputting tokens
void parseTokens(Source* src, ref TokenSink output) pure
{
	while (!src.empty)
	{
		foreach (j; 0..src.rem.length-1)
		{
			if (src.rem[j] == '[')
			{
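				// among returns the 1-based index of the matched needle (0 if
				// none), so this cast assumes the TokenType layout: text == 0
				// (no match), ':' == statements, '=' == expressions, '<' == directive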
				auto type = cast(TokenType)src.rem[j+1].among(':', '=', '<');
				if (type == TokenType.text) continue;

				if (j > 0) output.put(Token(TokenType.text, src.rem[0..j].source, src.curPos()));
				src.rem = src.rem[j..$];
				const pos = src.curPos();
				src.rem = src.rem[2..$];

				with (TokenType) final switch (type)
				{
					case text:
						assert (false);

					case statements:
					case expressions:
						output.put(parseD(src, type, pos));
						break;

					case directive:
						parseDirective(src, pos, output);
						break;
				}

				goto next;
			}
		}
		output.put(Token(TokenType.text, src.rem.source, src.curPos()));
		src.rem = src.rem[$..$];
next:
	}
	output.put(Token(TokenType.directive, "EOF", src.curPos()));
}
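
// A hedged sketch of the scanning loop: plain text around one statement tag,
// ending with the synthetic EOF directive token (positions not checked)
unittest
{
	TokenSink app;
	auto src = new Source("foo.dj", "a[: x; :]b");
	parseTokens(src, app);
	auto types = app[].map!(t => t.type).array;
	assert (types == [TokenType.text, TokenType.statements, TokenType.text, TokenType.directive]);
	assert (app[][$-1].value == "EOF");
}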

/// Parses something like "dCode(42); :]" when terminator == ":]"
Token parseD(Source* src, TokenType type, ref const(Pos) pos) pure
{
	assert (type.among(TokenType.statements, TokenType.expressions));
	/*
		Expressions must be fully balanced.  We need to check this to disambiguate the ] terminator.
		Statements don't need to be balanced.  E.g. (contrived example):
			lorem ipsum
			[: processThunks([{ :]
				Here's a lambda in an array passed to a function call.
			[: }]); :]
	*/
	Nestables balance_what = kAllNestables;
	string terminator = "]";
	if (type == TokenType.statements)
	{
		balance_what = Nestables.init;
		terminator = ":]";
	}
	try
	{
		auto rem = dSkipFind(src.rem, terminator.byCodeUnit, balance_what, src.offset);
		const value = src.rem[0..src.rem.length-rem.length];
		src.rem = rem[terminator.length..$];
		return makeCodeToken(type, value, pos);
	}
	catch (LexicalException e)
	{
		const e_pos = src.posAt(e.offset);
		throw syntaxException(e.msg, e_pos);
	}
}

pure
unittest
{
	bool test(string code, TokenType type, Token exp) pure
	{
		auto src = new Source("foo.dj", code);
		const pos = src.curPos();
		auto result = parseD(src, type, pos);
		exp.pos = result.pos;  // Ignore pos for this test
		return result == exp;
	}

	with (TokenType)
	{
		assert (test("dCode(42); :]", statements, Token(statements, "dCode(42); ")));
		assert (test("writeln(`:]`); :]", statements, Token(statements, "writeln(`:]`); ")));
		assert (test("processThunks([{ :]", statements, Token(statements, "processThunks([{ ")));
		assert (test(":]", statements, Token(statements, "")));
		assertThrown(test("]]", statements, Token.init));
		assertThrown(test("", statements, Token.init));

		assert (test("dCode(42) ]", expressions, Token(expressions, "dCode(42) ")));
		assert (test("1, 2, 3]", expressions, Token(expressions, "1, 2, 3")));
		assert (test("[]]", expressions, Token(expressions, "[]")));
		assert (test("`]` ]", expressions, Token(expressions, "`]` ")));
		assert (test(`q"[[]]"]`, expressions, Token(expressions, `q"[[]]"`)));
		assertThrown(test("processThunks([{ ]", expressions, Token.init));
		assert (test("]", expressions, Token(expressions, "")));
	}
}

/// Parses a directive like "foo bar >]" (the opening [< has already been consumed), handling raw blocks inline
void parseDirective(Source* src, ref const(Pos) pos, ref TokenSink output) pure
{
	import std.conv : text;
	auto pieces = src.rem.findSplit(">]".byCodeUnit);
	if (!pieces) throw syntaxException("no >] terminator found", pos);
	src.rem = pieces[2];
	auto directive = makeCodeToken(TokenType.directive, pieces[0], pos);
	output.put(directive);
	auto args = directive.value.splitter.filter!(e => !e.empty);
	if (args.empty || args.front != "raw")
	{
		return;
	}
	args.popFront();

	string8b end_tag;
	if (!args.empty)
	{
		static import std.ascii;
		end_tag = args.front.byCodeUnit;
		enforce(end_tag.all!(c => std.ascii.isAlphaNum(c) || c == '_'), syntaxException(text("raw directive ending tag must be an identifier using ASCII letters, digits and underscores (got ", end_tag, ")"), src));
		args.popFront();
		enforce(args.empty, syntaxException("extra arguments for raw directive (maximum 1 supported)", src));
	}

	// Search for the matching [< endraw TAG >] terminator
	foreach (j; 0..src.rem.length)
	{
		auto match = endRawMatch(src.rem[j..$], end_tag);
		if (match.empty) continue;
		const raw_text_pos = src.curPos();
		output.put(makeCodeToken(TokenType.text, src.rem[0..j], raw_text_pos));
		src.rem = src.rem[j..$];
		const end_tag_pos = src.curPos();
		output.put(makeCodeToken(TokenType.directive, match[2..$-2], end_tag_pos));
		src.rem = src.rem[match.length..$];
		return;
	}

	throw syntaxException(text("no [< endraw", end_tag.empty ? "" : " ", end_tag, " >] terminator found"), src);
}

unittest
{
	bool test(string code, Token[] exp) @trusted
	{
		TokenSink app;
		auto src = new Source("foo.dj", code);
		const pos = src.curPos();
		parseDirective(src, pos, app);
		if (app[].length != exp.length) return false;
		foreach (r, e; lockstep(app[], exp))
		{
			r.pos = e.pos;  // Ignore pos for this test
			if (r != e) return false;
		}
		return true;
	}

	assertThrown(test("", []));

	assert (test("foo bar >]", [Token(TokenType.directive, "foo bar ")]));
	assert (test(">]", [Token(TokenType.directive, "")]));
	assert (test("foo|>] lorem ipsum", [Token(TokenType.directive, "foo", Pos.init, WhitespaceStripping.right)]));
	assert (test("| foo >]", [Token(TokenType.directive, " foo ", Pos.init, WhitespaceStripping.left)]));
	assert (test("| foo |>]", [Token(TokenType.directive, " foo ", Pos.init, WhitespaceStripping.both)]));

	assert (test("raw>][<endraw>]lorem ipsum", [
		Token(TokenType.directive, "raw"),
		Token(TokenType.text, ""),
		Token(TokenType.directive, "endraw"),
	]));

	assert (test("| raw >]lorem ipsum[< endraw |>]", [
		Token(TokenType.directive, " raw ", Pos.init, WhitespaceStripping.left),
		Token(TokenType.text, "lorem ipsum"),
		Token(TokenType.directive, " endraw ", Pos.init, WhitespaceStripping.right),
	]));

	assert (test("raw|>]lorem [< ipsum[<|endraw>]", [
		Token(TokenType.directive, "raw", Pos.init, WhitespaceStripping.right),
		Token(TokenType.text, "lorem [< ipsum"),
		Token(TokenType.directive, "endraw", Pos.init, WhitespaceStripping.left),
	]));

	assert (test("raw tag>][< endraw >][<endraw tag>]", [
		Token(TokenType.directive, "raw tag"),
		Token(TokenType.text, "[< endraw >]"),
		Token(TokenType.directive, "endraw tag"),
	]));

	assert (test("raw tag >][< endraw >][<endraw tag>]", [
		Token(TokenType.directive, "raw tag "),
		Token(TokenType.text, "[< endraw >]"),
		Token(TokenType.directive, "endraw tag"),
	]));

	assertThrown(test("raw>]", []));
	assertThrown(test("raw tag>][<endraw>]", []));
}

/**
  Matches an endraw directive (with possible tag) at the start of str

  Returns: the matching substring, or empty if not matched
*/
string8b endRawMatch(string8b str, string8b tag) pure
{
	// Equivalent to the regex `^\[<\|?\s*endraw\s*TAG\s*\|?>\]`
	enum miss = string8b.init;
	auto s = str;
	if (!s.startsWith("[<")) return miss;
	s = s[2..$];
	if (s.empty) return miss;
	if (s.front == '|') s.popFront();
	s = s.source.stripLeft.byCodeUnit;
	if (!s.startsWith("endraw")) return miss;
	s = s["endraw".length..$];
	s = s.source.stripLeft.byCodeUnit;
	if (!s.startsWith(tag)) return miss;
	s = s[tag.length..$];
	s = s.source.stripLeft.byCodeUnit;
	if (s.empty) return miss;
	if (s.front == '|') s.popFront();
	if (!s.startsWith(">]")) return miss;
	s = s[2..$];
	return str[0..str.length-s.length];
}

pure
unittest
{
	bool test(string str, string tag, string result)
	{
		return endRawMatch(str.byCodeUnit, tag.byCodeUnit) == result.byCodeUnit;
	}

	assert (test("", "", ""));
	assert (test("nope", "", ""));
	assert (test("", "nope", ""));
	assert (test("nope", "nope", ""));

	assert (test("[<endraw>]", "", "[<endraw>]"));
	assert (test("[< endraw >]", "", "[< endraw >]"));
	assert (test("[<  endraw  >]", "", "[<  endraw  >]"));
	assert (test("[< endraw >] lorem ipsum", "", "[< endraw >]"));
	assert (test("[<| endraw |>] lorem ipsum", "", "[<| endraw |>]"));
	assert (test("[< endraw >] lorem ipsum", "tag", ""));
	assert (test("[< endraw tag >] lorem ipsum", "tag", "[< endraw tag >]"));
	assert (test("[<endraw tag>] lorem ipsum", "tag", "[<endraw tag>]"));
}

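/// Builds a token from tag innards, converting leading/trailing | markers into whitespace-stripping flags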
Token makeCodeToken(TokenType type, string8b code, ref const(Pos) pos) pure
{
	WhitespaceStripping ws_strip;
	if (code.startsWith("|"))
	{
		ws_strip |= WhitespaceStripping.left;
		code = code[1..$];
	}
	if (code.endsWith("|"))
	{
		ws_strip |= WhitespaceStripping.right;
		code = code[0..$-1];
	}
	return Token(type, code.source, pos, ws_strip);
}
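
// A minimal sketch of the | marker handling (hedged: position is a dummy value)
pure unittest
{
	const pos = Pos.init;

	auto t = makeCodeToken(TokenType.directive, "|raw|".byCodeUnit, pos);
	assert (t.value == "raw");
	assert (t.whitespace_stripping == WhitespaceStripping.both);

	t = makeCodeToken(TokenType.directive, "raw".byCodeUnit, pos);
	assert (t.value == "raw");
	assert (t.whitespace_stripping == WhitespaceStripping.init);
}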