jaypha.inet.mime.reading source code

1 //Written in the D programming language
2 /*
 * Routines for reading and parsing MIME documents.
4  *
5  * Copyright: Copyright (C) 2013-2014 Jaypha
6  *
7  * Distributed under the Boost Software License, Version 1.0.
8  * (See http://www.boost.org/LICENSE_1_0.txt)
9  *
10  * Authors: Jason den Dulk
11  */
12 
13 module jaypha.inet.mime.reading;
14 
15 public import jaypha.inet.mime.header;
16 
// Character classes used when lexing header field bodies.
enum string mimeSpecials  = "()<>@,;:\\\".[]";     // "specials", from RFC822
enum string mimetSpecials = "()<>@,;:\\\"/[]?=";   // "tspecials", from RFC2045
enum string mimeLwsp      = " \t";                 // linear white space: space and tab

// A delimiter is any special character or any linear white space.
enum string mimeDelimeters      = mimeSpecials ~ mimeLwsp;
enum string mimeTokenDelimeters = mimetSpecials ~ mimeLwsp;
22 
23 import std.array;
24 import std.string;
25 import std.range;
26 import std.algorithm;
27 import std.traits;
28 
29 //-----------------------------------------------------------------------------
30 // extracts MIME parameters.
// Parameters are of the format *(';' attribute '=' value)
32 // value = token / quoted-string.
33 // TODO updated def in RFC2231
34 
// Reads zero or more "; attribute = value" pairs from the front of source and
// stores each one in parameters (a later pair with the same attribute
// overwrites an earlier one). White space and parenthesised comments around
// every lexical item are skipped. Reading stops at the first position that
// is not a ';'; anything after that is ignored.
//
// source is taken by value, so the caller's string is not advanced.
//
// Throws: Exception if an attribute or value is missing, or if '=' does not
// follow the attribute.
void extractMimeParams(string source, ref string[string] parameters)
{
  source.skipSpaceComment();

  for (;;)
  {
    // Every parameter is introduced by a semicolon; anything else ends the list.
    if (source.empty || source.front != ';')
      return;
    source.popFront();

    source.skipSpaceComment();
    auto name = source.extractToken();
    source.skipSpaceComment();

    if (source.cfront != '=')
      throw new Exception("malformed MIME header");
    source.popFront();
    source.skipSpaceComment();

    // A value is either a quoted string or a bare token.
    parameters[name] = (source.cfront == '\"')
      ? source.extractQuotedString()
      : source.extractToken();

    source.skipSpaceComment();
  }
}
54 
55 //-----------------------------------------------------------------------------
56 
unittest
{
  // Checks extractMimeParams against inputs containing comments, quoted
  // values, several parameters, and a dangling ';'.
  string t1 = "; bean  = (not this)rock ";
  string t2 = ";dog(canine)  = \"A (canine) animal\"  ";
  string t3 = " ;(c)rabbit=jack;jack=\"\"";
  string t4 = ";";

  string[string] parms;

  // The comment before the value is skipped and the trailing space is not
  // part of the token.
  extractMimeParams(t1,parms);
  assert("bean" in parms);
  assert(parms["bean"] == "rock");   // compare, do not assign

  // Parentheses inside a quoted string are literal text, not a comment.
  extractMimeParams(t2,parms);
  assert("dog" in parms);
  assert(parms["dog"] == "A (canine) animal");   // compare, do not assign

  // Two parameters in one field body; the second has an empty quoted value.
  extractMimeParams(t3,parms);
  assert("rabbit" in parms);
  assert("jack" in parms);
  assert(parms["rabbit"] == "jack");
  assert(parms["jack"].empty);

  // A ';' with no attribute after it is malformed.
  try {
    extractMimeParams(t4,parms);
    assert(false);
  } catch (Exception e) {
  }
}
86 
87 //-----------------------------------------------------------------------------
88 // Extracts a MIME token from the input string.
89 
// Removes and returns the leading run of characters of source up to, but not
// including, the first character found in mimeTokenDelimeters. If no
// delimiter is present the whole string is the token and source becomes empty.
//
// Throws: Exception if the token would be empty (source is empty or starts
// with a delimiter). source is left unchanged in that case.
string extractToken(ref string source)
{
  auto tail = source.findAmong(mimeTokenDelimeters);
  auto tokenLength = source.length - tail.length;

  if (tokenLength == 0)
    throw new Exception("malformed MIME header");

  auto token = source[0 .. tokenLength];
  source = tail;
  return token;
}
98 
99 //-----------------------------------------------------------------------------
100 // Extracts a quoted string token from the input string.
101 
// Removes a quoted string from the front of source and returns its contents
// with the surrounding quotes dropped and backslash escapes resolved (a
// backslash makes the following character literal).
//
// The caller is expected to have checked that source starts with a '"'; the
// first character is consumed without being inspected.
//
// Throws: Exception if source is empty or ends before the closing quote.
string extractQuotedString(ref string source)
{
  auto text = appender!string();

  // Opening quote. Uses the checked pop so that an empty source is reported
  // as a malformed header rather than tripping a range assertion.
  source.cpopFront();

  while (source.cfront != '\"')
  {
    if (source.front == '\\')
      source.popFront();      // drop the backslash, keep the character after it
    text.put(source.cfront);  // throws if the input ended right after a backslash
    source.popFront();
  }

  source.popFront(); // closing quote; cfront above has shown it is there
  return text.data;
}
117 
118 //-----------------------------------------------------------------------------
119 
unittest
{
  // Exercises extractToken and extractQuotedString on well-formed and
  // malformed input.
  string t1 = "john@";
  string t2 = "\" a quoted \\\" string\"g";
  string t3 = "\"unfinished";

  // The token stops at the first delimiter, which stays in the input.
  assert(extractToken(t1) == "john");
  assert(t1 == "@");

  // Escaped quotes are part of the string; text after the closing quote remains.
  assert(extractQuotedString(t2) == " a quoted \" string");
  assert(t2 == "g");

  // A missing closing quote is an error, detected once the input runs out.
  try
  {
    extractQuotedString(t3);
    assert(false);
  } catch (Exception e)
  {
    assert(t3.empty);
  }

  // An empty input holds no token.
  try
  {
    t3 = "";
    extractToken(t3);
    assert(false);
  } catch (Exception e)
  {
    assert(t3.empty);
  }

  // Neither does an input that starts with a delimiter; it must be left as is.
  try
  {
    t3 = "@home";
    extractToken(t3);
    assert(false);
  } catch (Exception e)
  {
    assert(t3 == "@home");
  }
}
161 
162 //-----------------------------------------------------------------------------
163 // Skips all contiguous spaces and comments.
164 
// Advances source past any mix of linear white space and parenthesised
// comments at its front. Comments may nest, and a backslash inside a comment
// makes the following character literal, so "\)" does not close the comment.
//
// Throws: Exception if the input ends inside a comment.
void skipSpaceComment(ref string source)
{
  skipSpace(source);

  while (!source.empty && source.front == '(')
  {
    source.popFront();   // the opening parenthesis
    ulong depth = 1;

    while (depth != 0)
    {
      auto ch = source.cfront;   // throws on an unterminated comment
      source.popFront();

      if (ch == '\\')
        source.cpopFront();      // discard the escaped character as well
      else if (ch == '(')
        ++depth;
      else if (ch == ')')
        --depth;
    }

    skipSpace(source);
  }
}
188 
189 //-----------------------------------------------------------------------------
190 
// Advances source past any leading linear white space (the characters in
// mimeLwsp). Does nothing if source is empty or starts with anything else.
void skipSpace(ref string source)
{
  // canFind replaces std.string.inPattern, which has been deprecated and
  // dropped from Phobos; for a pattern with no '-' or '^' the two agree.
  while (!source.empty && mimeLwsp.canFind(source.front))
    source.popFront();
}
196 
197 //-----------------------------------------------------------------------------
198 
unittest
{
  // Checks skipSpace and skipSpaceComment, including nested comments, an
  // escaped ')' and a comment that is never closed.
  string spaced     = "  xyz";
  string commented  = "   (comment1) (comment (2)(2))(tricky \\) comment)  non-comment";
  string unfinished = "(unfinished comment";

  skipSpace(spaced);
  assert(spaced == "xyz");

  skipSpaceComment(commented);
  assert(commented == "non-comment");

  // An unterminated comment consumes the whole input and then throws.
  try
  {
    skipSpaceComment(unfinished);
    assert(false);
  }
  catch (Exception e)
  {
    assert(unfinished.empty);
  }
}
219 
220 //-----------------------------------------------------------------------------
221 // "compulsory" front and popFront. Spits the dummy if empty.
222 
// Checked version of front: returns the first element of range without
// consuming it.
//
// Throws: Exception if range is empty.
auto cfront(R)(ref R range) if (isInputRange!R)
{
  if (!range.empty)
    return range.front;

  throw new Exception("malformed MIME header");
}
228 
// Checked version of popFront: discards the first element of range.
//
// Throws: Exception if range is empty.
void cpopFront(R)(ref R range) if (isInputRange!R)
{
  if (!range.empty)
  {
    range.popFront();
    return;
  }

  throw new Exception("malformed MIME header");
}
234 
235 //-----------------------------------------------------------------------------
236 // Reads in headers from a MIME document. Unfolds multiline headers, but
237 // does not perform any other lexing of header field bodies.
238 // Does consume the empty line following headers.
// TODO: Allow reading from strings as well as octet streams.
240 
// Reads header lines from reader until an empty line is met, and returns one
// MimeHeader per header field. A line that starts with a space or tab is a
// continuation: it is appended, leading white space included, to the body of
// the field before it. A field body is everything after the first ':' and is
// not processed further.
//
// Throws: Exception if a line is not terminated by MimeEoln, if a header
// line has no ':', or if a continuation line appears before any header.
MimeHeader[] parseMimeHeaders(BR)(ref BR reader)
  if ((isInputRange!BR && is(ElementType!BR : ubyte)))
{
  MimeHeader[] headers;
  /* Read headers until we get to a blank line */

  while (true)
  {
    auto buf = jaypha.algorithm.findSplit(reader, cast(ubyte[])MimeEoln);
    if (buf[1] != cast(ubyte[]) MimeEoln) throw new Exception("malformed Mime Header");

    // An empty line marks the end of the headers.
    if (buf[0].length == 0) break;

    auto header = cast(string) buf[0];

    // canFind replaces std.string.inPattern, which has been deprecated and
    // dropped from Phobos; for this pattern the two agree.
    if (mimeLwsp.canFind(header[0]))
    {
      // Leading whitespace means this line is part of the previous header.
      // With no previous header there is nothing to extend; report this as
      // malformed input instead of indexing into an empty array.
      if (headers.length == 0) throw new Exception("malformed Mime Header");
      headers[$-1].fieldBody ~= header;
    }
    else
    {
      auto buf2 = std.algorithm.findSplit(header,":");
      if (buf2[1] != ":") throw new Exception("malformed Mime Header");
      headers ~= MimeHeader(buf2[0], buf2[2]);
    }
  }
  return headers;
}
270 
271 //-----------------------------------------------------------------------------
272 
unittest
{
  // Checks parseMimeHeaders on two header fields, one of them folded over
  // two lines, followed by a body.
  // The literals are joined with '~': implicit concatenation of adjacent
  // string literals is deprecated in D.
  string entity_text =
    "Content-Type: text/plain; charset=us-ascii\r\n" ~
    "Content-Disposition: blah blah \r\n" ~
    "\tblah\r\n" ~
    "\r\n" ~
    "This is explicitly typed plain US-ASCII text.\r\n" ~
    "It DOES end with a linebreak.\r\n";

  //auto r1 = inputRangeObject(cast(ubyte[]) entity_text.dup);
  auto r1 = cast(ubyte[]) entity_text;

  auto headers = parseMimeHeaders(r1);
  assert(headers.length == 2);
  assert(headers[0].name == "Content-Type");
  assert(headers[0].fieldBody == " text/plain; charset=us-ascii");
  assert(headers[1].name == "Content-Disposition");
  // The folded line is joined on without its line break.
  assert(headers[1].fieldBody == " blah blah \tblah");
  // The blank line has been consumed; r1 now starts at the body.
  assert(r1.front == 'T');
}
294 
295 //-----------------------------------------------------------------------------
296 // Entity Reader. Takes an input range representing a MIME document, extracts
297 // the headers and presents the rest for further reading.
298 
// Parses the headers at the front of reader with parseMimeHeaders and returns
// them together with the range as parseMimeHeaders left it, so that the
// caller can go on to read the entity's content.
//
// reader is taken by value; whether the caller's own range is advanced
// depends on whether BR has reference or value semantics.
auto mimeEntityReader(BR)(BR reader)
  if ((isInputRange!BR && is(ElementType!BR : ubyte)))
{
  auto parsedHeaders = parseMimeHeaders(reader);
  return MimeEntityReader!BR(parsedHeaders, reader);
}

// What mimeEntityReader returns: the headers of an entity plus a range over
// whatever follows them.
struct MimeEntityReader(BR)
{
  MimeHeader[] headers;   // header fields, in the order they were read
  BR content;             // the input range, positioned after the headers
}
310 
311 //-----------------------------------------------------------------------------
312 
unittest
{
  import std.stdio;
  import std.exception;
  import std.array;
  import std.algorithm;
  import std.range;

  // Checks mimeEntityReader: headers are split off and the content range
  // yields exactly the body.
  // The literals are joined with '~': implicit concatenation of adjacent
  // string literals is deprecated in D.
  string entity_text =
    "Content-Type: text/plain; charset=us-ascii\r\n" ~
    "Content-Disposition: blah blah \r\n" ~
    "\tblah\r\n" ~
    "\r\n" ~
    "This is explicitly typed plain US-ASCII text.\r\n" ~
    "It DOES end with a linebreak.\r\n";

  auto r1 = inputRangeObject(cast(ubyte[]) entity_text.dup);

  auto entity = mimeEntityReader(r1);

  // The content range has the same type as the range that was passed in.
  static assert(is(typeof(entity.content) == typeof(r1)));
  assert(entity.headers.length == 2);
  assert(entity.headers[0].name == "Content-Type");
  assert(entity.headers[0].fieldBody == " text/plain; charset=us-ascii");
  assert(entity.headers[1].name == "Content-Disposition");
  assert(entity.headers[1].fieldBody == " blah blah \tblah");

  auto buff = appender!(ubyte[]);

  entity.content.copy(buff);
  assert(buff.data ==
    "This is explicitly typed plain US-ASCII text.\r\n" ~
    "It DOES end with a linebreak.\r\n");
}
347 
348 //-----------------------------------------------------------------------------
349 // Multipart Entity Reader. Takes an input range and converts it into an
350 // input range of Mime Entity Readers. Each element represents a Mime Entity.
351 // Presumes that headers of the primary entity have already been extracted.
352 // TODO: See if a way can be found to preserve the preamble and epilogue.
353 
354 import jaypha.algorithm;
355 import jaypha.range;
356 
// Returns an input range whose elements are entity readers, one per body
// part of the multipart content held in reader.
//
// On entry everything up to and including the first "--" ~ boundary line (the
// preamble) is read and discarded, and the first part is opened. The returned
// range reads from reader itself, so reader must not be used independently
// while a part is being read.
//
// boundary is the boundary string without the leading "--".
//
// Throws: popFront throws Exception if a boundary that is not the closing one
// is not followed by an end of line.
auto mimeMultipartReader(BR)(ref BR reader, string boundary)
  if (isInputRange!BR && is(ElementType!BR : ubyte))
{
  // Between parts the delimiter is preceded by a line break; the very first
  // one is searched for without it.
  string delimiter = "\r\n--"~boundary;

  jaypha.algorithm.findSplit(reader, delimiter[2..$]);
  jaypha.algorithm.findSplit(reader, "\r\n"); // rest of the boundary line; not checked

  auto entity = mimeEntityReader(readUntil(reader, delimiter));

  alias typeof(entity) Entity;

  struct MR
  {
    @property bool empty() { return reader.empty; }

    @property Entity front() { return entity; }

    void popFront()
    {
      // The user may pop before reading the whole part; discard what is left.
      if (!entity.content.empty)
        entity.content.drain();

      // Whatever follows the boundary on its line tells us if it was the last.
      auto restOfLine = jaypha.algorithm.findSplit(reader, MimeEoln);

      if (startsWith(restOfLine[0], "--"))
      {
        // Closing delimiter: throw away the epilogue, leaving reader empty.
        reader.drain();
        return;
      }

      if (restOfLine[1] != MimeEoln)
        throw new Exception("malformed MIME Entity");
      entity = mimeEntityReader(readUntil(reader, delimiter));
    }
  }
  return MR();
}
394 
395 //----------------------------------------------------------------------------
// Advances the input range until the sentinel is found
397 
// Consumes elements of r up to and including the first occurrence of
// sentinel. Elements are compared after being cast to char.
//
// Returns: true if the sentinel was found, in which case r is positioned
// directly after it; false if r ran out first, in which case r is empty.
// An empty sentinel is found at once without consuming anything.
//
// r is only ever moved forward, so the search keeps track of how much of the
// sentinel the most recently consumed elements match (Knuth-Morris-Pratt).
// This finds a sentinel that overlaps a failed partial match, or that ends
// exactly at the end of the input, and copes with an r that is empty on entry.
private bool skipOverUntil(Reader)(ref Reader r, string sentinel)
{
  if (sentinel.length == 0)
    return true;

  // border[i] is the length of the longest proper prefix of
  // sentinel[0 .. i+1] that is also a suffix of it.
  auto border = new size_t[sentinel.length];
  for (size_t i = 1, k = 0; i < sentinel.length; ++i)
  {
    while (k > 0 && sentinel[i] != sentinel[k])
      k = border[k-1];
    if (sentinel[i] == sentinel[k])
      ++k;
    border[i] = k;
  }

  size_t matched = 0;   // how many sentinel characters the consumed tail matches
  while (!r.empty)
  {
    auto c = cast(char) r.front;
    r.popFront();

    // On a mismatch fall back to the longest shorter match still possible.
    while (matched > 0 && c != sentinel[matched])
      matched = border[matched-1];
    if (c == sentinel[matched])
      ++matched;

    if (matched == sentinel.length)
      return true;
  }
  return false;
}
425 
426 
unittest
{
  // Checks skipOverUntil on a short input, then walks a complete multipart
  // document with mimeMultipartReader.
  // The literals are joined with '~': implicit concatenation of adjacent
  // string literals is deprecated in D.
  string preamble =
    "This is the preamble.  It is to be ignored, though it\r\n" ~
    "is a handy place for composition agents to include an\r\n" ~
    "explanatory note to non-MIME conformant readers.\r\n" ~
    "\r\n" ~
    "--simple boundary  \t  \t\t \r\n" ~
    "\r\n" ~
    "This is implicitly typed plain US-ASCII text.\r\n" ~
    "It does NOT end with a linebreak.\r\n" ~
    "--simple boundary\r\n" ~
    "Content-type: text/plain; charset=us-ascii\r\n" ~
    "\r\n" ~
    "This is explicitly typed plain US-ASCII text.\r\n" ~
    "It DOES end with a linebreak.\r\n" ~
    "\r\n" ~
    "--simple boundary--\r\n" ~
    "\r\n" ~
    "This is the epilogue.  It is also to be ignored.\r\n";

  auto buff = appender!(ubyte[]);

  ubyte[] txt = cast(ubyte[]) "acabacbxyz".dup;

  auto r1 = inputRangeObject(txt);

  // Sentinel present: the range ends up directly after it.
  auto x = r1.skipOverUntil("cbx");
  assert(x);
  r1.copy(buff);
  assert(cast(char[])(buff.data) == "yz");

  buff.clear();

  txt = cast(ubyte[]) "acabacbxyz".dup;
  r1 = inputRangeObject(txt);

  // Sentinel absent: the whole range is consumed.
  assert(!r1.skipOverUntil("c1bx"));
  assert(r1.empty);

  txt = cast(ubyte[]) preamble.dup;
  r1 = inputRangeObject(txt);

  auto r2 = mimeMultipartReader(r1, "simple boundary");

  // First part: no headers, and the line break before the boundary is not
  // part of its content.
  assert(r1.front == cast(ubyte)'T');
  auto r3 = r2.front;

  assert(r3.headers.length == 0);
  put(buff,r3.content);
  assert(buff.data == "This is implicitly typed plain US-ASCII text.\r\n" ~
    "It does NOT end with a linebreak.");
  assert(r3.content.empty);
  assert(r1.front == cast(ubyte)'\r');

  // Second part: one header, and content that ends with a line break.
  buff.clear();
  r2.popFront();
  r3 = r2.front;
  assert(r3.headers.length == 1);

  r3.content.copy(buff);

  assert(buff.data ==
    "This is explicitly typed plain US-ASCII text.\r\n" ~
    "It DOES end with a linebreak.\r\n");
  assert(r3.content.empty);

  // Popping past the last part discards the epilogue.
  r2.popFront();
  assert(r2.empty);
  assert(r1.empty);

}
504 
505 //----------------------------------------------------------------------------
// Consumes the front of the range as long as it matches the given prefix.
// Returns whether or not the entire prefix was matched. If all_or_nothing is
// true, then an exception is thrown if only part of the prefix is matched.
509 // Designed to work with ranges that cannot be rewound.
510 
// Consumes elements from the front of r for as long as they match prefix,
// and keeps what it consumed even if the match then fails (the range cannot
// be rewound).
//
// Returns: true if all of prefix was matched and consumed; false otherwise.
// If the very first element does not match, nothing is consumed. An empty
// prefix always matches and consumes nothing.
//
// Throws: Exception if all_or_nothing is true and the match failed after at
// least one element had been consumed.
bool skipOverAnyway(R)(ref R r, string prefix, bool all_or_nothing = false)
 if (isInputRange!R)
{
  // Nothing to match; also keeps prefix[0] below within bounds.
  if (prefix.length == 0)
    return true;

  if (r.empty || r.front != prefix[0])
    return false;

  size_t matched = 0;
  do
  {
    r.popFront();
    ++matched;
  } while (matched < prefix.length && !r.empty && r.front == prefix[matched]);

  if (matched == prefix.length) return true;
  if (all_or_nothing) throw new Exception("malformed MIME Entity");
  return false;
}
528 
unittest
{
  // Checks skipOverAnyway: a full match, a partial match, no match at all,
  // and a partial match with all_or_nothing set.
  ubyte[] bytes = cast(ubyte[]) "acabacbxyz".dup;
  auto input = inputRangeObject(bytes);
  auto rest = appender!(ubyte[])();

  assert(skipOverAnyway(input, "aca"));    // consumes "aca"
  assert(!skipOverAnyway(input, "baa"));   // consumes "ba", then fails
  assert(!skipOverAnyway(input, "xyz"));   // first element differs: consumes nothing

  // Consumes "cb", then fails on 'x' and throws.
  try
  {
    skipOverAnyway(input, "cbz", true);
    assert(false);
  }
  catch (Exception e)
  {
  }

  input.copy(rest);
  assert(cast(char[])(rest.data) == "xyz");
}
546 
547 //----------------------------------------------------------------------------
548 // An alternative to std.algorithm.until that works with non-rewindable input
549 // ranges.
550 
// Returns an input range that yields the elements of r up to, but not
// including, the first occurrence of sentinel that it detects. The sentinel
// itself is consumed from r. The returned range reads from r directly and
// never rewinds it.
//
// Because r cannot be rewound, a possible sentinel has to be consumed from r
// before it is known to be complete. If it turns out to be incomplete, the
// elements already consumed are handed out again from sentinel itself.
//
// The check for the sentinel runs after each popFront, not on construction.
auto readUntil(R,E)(ref R r, E sentinel)
  if (isInputRange!R && isInputRange!E &&
      isScalarType!(ElementType!E) && isScalarType!(ElementType!R))
{
  alias ElementType!R Elem;

  final class ReadUntil
  {
    // Set once r is exhausted or the complete sentinel has been consumed.
    bool empty = false;

    // The current element: a held-back sentinel element if there are any
    // still to hand out, otherwise the front of r.
    @property Elem front()
    {
      if (replayed < held) return sentinel[replayed];
      return r.front;
    }

    void popFront()
    {
      if (empty)
        return;

      if (replayed < held)
      {
        // Still handing out a failed partial match.
        if (++replayed < held)
          return;
        // All of it has been handed out; carry on from the front of r.
        replayed = 0;
        held = 0;
      }
      else
      {
        r.popFront();
      }

      sentinelCheck();
    }

    // Looks for the sentinel at the front of r, consuming elements for as
    // long as they match. Afterwards either the range is empty (r ran out
    // or the sentinel was complete) or held counts the matching elements
    // that were consumed and must be handed out again.
    void sentinelCheck()
    {
      if (r.empty) { empty = true; return; }
      if (r.front != sentinel[0]) return;

      do
      {
        r.popFront();
        ++held;
      } while (!r.empty && held < sentinel.length && r.front == sentinel[held]);

      if (held == sentinel.length)
        empty = true;
    }

    private:
      size_t held = 0;       // sentinel elements consumed from r in the current match
      size_t replayed = 0;   // how many of those have been handed out again
  }

  return new ReadUntil();
}
623 
624 //----------------------------------------------------------------------------
625 
unittest
{
  // Checks readUntil: the content before the sentinel is yielded, including
  // a partial match inside it, and the source is left directly after the
  // sentinel.
  ubyte[] bytes = cast(ubyte[]) "acabacbxyz".dup;
  auto input = inputRangeObject(bytes);
  auto collected = appender!(ubyte[]);

  auto upToSentinel = readUntil(input, "acb");
  upToSentinel.copy(collected);
  assert(cast(char[])(collected.data) == "acab");

  collected.clear();
  input.copy(collected);
  assert(cast(char[])(collected.data) == "xyz");
}
641