/[debian]/mimetic/trunk/mimetic/parser/itparser.h
ViewVC logotype

Contents of /mimetic/trunk/mimetic/parser/itparser.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1731 - (show annotations)
Fri May 15 15:35:42 2009 UTC (12 years, 1 month ago) by gregoa
File MIME type: text/plain
File size: 23018 byte(s)
New upstream release.
1 #ifndef _MIMETIC_PARSER_ITPARSER_H_
2 #define _MIMETIC_PARSER_ITPARSER_H_
3 #include <iterator>
4 #include <algorithm>
5 #include <stack>
6 #include <iostream>
7 #include <mimetic/tree.h>
8 #include <mimetic/utils.h>
9 #include <mimetic/mimeentity.h>
10
11
12 // FIXME: handle HigherLevelClosingBoundary
13
14 namespace mimetic
15 {
16
/**
 * Parser front-end selected by the category of \p Iterator.
 *
 * The primary template is deliberately empty: the parsing code lives in
 * the partial specializations written for the standard iterator
 * category tags.
 */
template<
    typename Iterator,
    typename ItCategory =
        typename std::iterator_traits<Iterator>::iterator_category
>
struct IteratorParser
{
};
23
24 /*
25 * Input Iterator
26 */
27 template<typename Iterator>
28 struct IteratorParser<Iterator, std::input_iterator_tag>
29 {
30
31 IteratorParser(MimeEntity& me)
32 : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
33 {
34 m_entityStack.push(&m_me);
35 }
36 virtual ~IteratorParser()
37 {
38 }
39 /**
40 * set the Ignore Mask to \p mask
41 */
42 void iMask(size_t mask) { m_iMask = mask; }
43 /**
44 * get the Ignore Mask
45 */
46 size_t iMask() const { return m_iMask; }
47 /**
48 * start parsing
49 */
50 void run(Iterator bit, Iterator eit)
51 {
52 m_bit = bit;
53 m_eit = eit;
54 doLoad();
55 }
56 protected:
57 typedef std::list<std::string> BoundaryList;
58 enum {
59 CR = 0xD,
60 LF = 0xA,
61 NL = '\n'
62 };
63 enum /* ParsingElem */ {
64 peIgnore,
65 pePreamble,
66 peBody,
67 peEpilogue
68 };
69 enum BoundaryType {
70 NoBoundary = 0,
71 Boundary,
72 ClosingBoundary,
73 HigherLevelBoundary
74 //, HigherLevelClosingBoundary
75 };
76 enum EntityType {
77 etRfc822,
78 etMsgRfc822,
79 etMultipart
80 };
81 // vars
82 MimeEntity& m_me;
83 Iterator m_bit, m_eit;
84 size_t m_iMask; // ignore mask
85 BoundaryList m_boundaryList;
86 BoundaryType m_lastBoundary;
87 std::stack<MimeEntity*> m_entityStack;
88
89 protected:
90 void appendPreambleBlock(const char* buf, int sz)
91 {
92 MimeEntity* pMe = m_entityStack.top();
93 pMe->body().preamble().append(buf,sz);
94 }
95
96 void appendEpilogueBlock(const char* buf, int sz)
97 {
98 MimeEntity* pMe = m_entityStack.top();
99 pMe->body().epilogue().append(buf,sz);
100 }
101
102 void appendBodyBlock(const char* buf, int sz)
103 {
104 MimeEntity* pMe = m_entityStack.top();
105 pMe->body().append(buf, sz);
106 }
107
108 std::string getBoundary()
109 {
110 const MimeEntity* pMe = m_entityStack.top();
111 const ContentType& ct = pMe->header().contentType();
112 return std::string("--") + ct.param("boundary");
113 }
114
115 void popChild()
116 {
117 m_entityStack.pop();
118 }
119
120 void pushNewChild()
121 {
122 MimeEntity* pMe = m_entityStack.top();
123 MimeEntity* pChild = new MimeEntity;
124 pMe->body().parts().push_back(pChild);
125 m_entityStack.push(pChild);
126 }
127
128 EntityType getType()
129 {
130 MimeEntity* pMe = m_entityStack.top();
131 const Header& h = pMe->header();
132 // will NOT be automatically created if it doesn't exists;
133 // null ContentType will be returned
134 const ContentType& ct = h.contentType();
135 if(ct.isMultipart())
136 return etMultipart;
137 else if (ct.type() == "message" && ct.subtype() == "rfc822")
138 return etMsgRfc822;
139 else
140 return etRfc822;
141 }
142
143 void addField(const std::string& name, const std::string& value)
144 {
145 MimeEntity* pMe = m_entityStack.top();
146 Header& h = pMe->header();
147 Header::iterator it = h.insert(h.end(), Field());
148 it->name(name);
149 it->value(value);
150 }
151
152 BoundaryType isBoundary(const std::string& line)
153 {
154 if(line.length() == 0 || line[0] != '-')
155 return m_lastBoundary = NoBoundary;
156
157 int level = 0; // multipart nesting level
158 int lineLen = line.length();
159 BoundaryList::const_iterator bit,eit;
160 bit = m_boundaryList.begin(), eit = m_boundaryList.end();
161 for(;bit != eit; ++bit, ++level)
162 {
163 const std::string& b = *bit;
164 int bLen = b.length();
165 if(line.compare(0, bLen, b) == 0)
166 {
167 // not the expected boundary, malformed msg
168 if(level > 0)
169 return m_lastBoundary=HigherLevelBoundary;
170 // plain boundary or closing boundary?
171 if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
172 return m_lastBoundary = ClosingBoundary;
173 else
174 return m_lastBoundary = Boundary;
175 }
176 }
177 return m_lastBoundary = NoBoundary;
178 }
179 // is new line
180 inline bool isnl(char c) const
181 {
182 return (c == CR || c == LF);
183 }
184 // is a two char newline
185 inline bool isnl(char a, char b) const
186 {
187 if(a == CR || a == LF)
188 if(b == (a == CR ? LF : CR))
189 return true;
190 return false;
191 }
192 void doLoad()
193 {
194 loadHeader();
195 loadBody();
196 }
197 bool valid() const
198 {
199 return m_bit != m_eit;
200 }
/**
 * Store \p c at index \p pos of the heap buffer \p buf and advance \p pos,
 * growing the buffer when needed.
 *
 * \param buf   buffer owned by the caller (allocated with new[]); may be 0
 *              on the first call. Replaced when the buffer grows.
 * \param bufsz usable size of \p buf; the allocation always holds one more
 *              byte so the caller can write a terminator at buf[pos].
 * \param c     character to store
 * \param pos   write index, incremented on return
 *
 * Every newly allocated buffer is zero-filled, including the first one,
 * and \p buf / \p bufsz are updated only after the allocation succeeded.
 */
void append(char*& buf, size_t& bufsz, char c, size_t& pos)
{
    enum { alloc_block = 128 };
    if(buf == 0 || pos >= bufsz)
    {
        size_t newsz = bufsz;
        while(pos >= newsz)
            newsz += alloc_block;
        // "()" value-initializes, i.e. zero-fills, the whole allocation
        char* grown = new char[newsz + 1]();
        if(buf != 0)
        {
            memcpy(grown, buf, bufsz);
            delete[] buf;
        }
        buf = grown;
        bufsz = newsz;
    }
    buf[pos++] = c;
}
222 // parses the header and calls addField and pushChild
223 // to add fields and nested entities
224 void loadHeader()
225 {
226 enum {
227 sInit,
228 sIgnoreLine,
229 sNewline,
230 sWaitingName,
231 sWaitingValue,
232 sWaitingFoldedValue,
233 sName,
234 sValue,
235 sIgnoreHeader
236 };
237 register int status;
238 int pos;
239 char *name, *value;
240 size_t nBufSz, vBufSz, nPos, vPos;
241 char prev, c = 0;
242
243 name = value = 0;
244 pos = nBufSz = vBufSz = nPos = vPos = 0;
245 status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
246 //status = sInit;
247 while(m_bit != m_eit)
248 {
249 c = *m_bit;
250 switch(status)
251 {
252 case sInit:
253 if(isnl(c))
254 status = sNewline;
255 else
256 status = sName;
257 continue;
258 case sIgnoreLine:
259 if(!isnl(c))
260 break;
261 status = sNewline;
262 continue;
263 case sNewline:
264 status = sWaitingName;
265 if(pos > 0)
266 {
267 pos = 0;
268 prev = c;
269 if(++m_bit == m_eit) goto out; //eof
270 c = *m_bit;
271 if(c == (prev == CR ? LF : CR))
272 {
273 --pos;
274 break;
275 } else
276 continue;
277 } else {
278 // empty line, end of header
279 prev = c;
280 if(++m_bit == m_eit) goto out; //eof
281 c = *m_bit;
282 if(c == (prev == CR ? LF : CR))
283 ++m_bit;
284 goto out;
285 }
286 case sWaitingName:
287 if(isblank(c))
288 {
289 // folded value
290 status = sWaitingFoldedValue;
291 continue;
292 }
293 // not blank, new field or empty line
294 if(nPos)
295 {
296 name[nPos] = 0;
297 // is not an empty field (name: \n)
298 if(vPos)
299 {
300 value[vPos] = 0;
301 addField(name,value);
302 } else
303 addField(name,"");
304 nPos = vPos = 0;
305 }
306 status = (isnl(c) ? sNewline : sName);
307 continue;
308 case sWaitingValue:
309 if(isblank(c))
310 break; // eat leading blanks
311 status = sValue;
312 continue;
313 case sWaitingFoldedValue:
314 if(isblank(c))
315 break; // eat leading blanks
316 append(value, vBufSz, ' ', vPos);
317 status = sValue;
318 continue;
319 case sName:
320 if(c > 32 && c < 127 && c != ':') {
321 if(nPos > 0 && isblank(name[nPos-1]))
322 {
323 /* "FIELDNAME BLANK+ c" found, consider that the first
324 body line */
325 onBlock(name, nPos, peBody);
326 goto out;
327 }
328 append(name, nBufSz, c, nPos);
329 } else if(c == ':') {
330 if(nPos == 0)
331 {
332 /* header line starting with ':', ignore the line */
333 status = sIgnoreLine;
334 continue;
335 }
336
337 /* malformed fix: remove any trailing blanks of the field
338 name */
339 while(nPos > 0 && isblank(name[nPos-1]))
340 nPos--;
341
342 status = sWaitingValue;
343 } else if(isblank(c)) {
344 /* blank after the field name -> malformed; it may be a
345 malformed field with trailing blank or
346 the start of the body; save the char so we can try to
347 recover later trimming the field name or push the
348 whole line to the body part with onBlock() */
349 append(name, nBufSz, c, nPos);
350 } else {
351 /* bad header line or blank line between header and body is
352 missing; consider we're in the first line of the body */
353 onBlock(name, nPos, peBody);
354 goto out;
355 }
356 break;
357 case sValue:
358 if(isnl(c))
359 {
360 status = sNewline;
361 continue;
362 }
363 append(value, vBufSz, c, vPos);
364 break;
365 case sIgnoreHeader:
366 if(isnl(c))
367 {
368 prev = c;
369 if(++m_bit == m_eit) goto out; //eof
370 c = *m_bit;
371 if(c == (prev == CR ? LF : CR))
372 ++m_bit;
373 if(pos == 0)
374 goto out; //empty line, eoh
375 pos = 0;
376 continue;
377 }
378 break;
379 }
380 ++m_bit; ++pos;
381 }
382 out:
383 if(name)
384 delete[] name;
385 if(value)
386 delete[] value;
387 return;
388 }
389 void loadBody()
390 {
391 switch(getType())
392 {
393 case etRfc822:
394 if(m_iMask & imBody)
395 jump_to_next_boundary();
396 else
397 copy_until_boundary(peBody);
398 break;
399 case etMultipart:
400 loadMultipart();
401 break;
402 case etMsgRfc822:
403 if(m_iMask & imChildParts)
404 jump_to_next_boundary();
405 else {
406 pushNewChild();
407 doLoad(); // load child entities
408 popChild();
409 }
410 break;
411 }
412 }
413 void loadMultipart()
414 {
415 std::string boundary = getBoundary();
416 m_boundaryList.push_front(boundary);
417 ParsingElem pe;
418 // preamble
419 pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
420 copy_until_boundary(pe);
421 while(m_bit != m_eit)
422 {
423 switch(m_lastBoundary)
424 {
425 case NoBoundary:
426 return; // eof
427 case Boundary:
428 if(m_iMask & imChildParts)
429 jump_to_next_boundary();
430 else {
431 pushNewChild();
432 doLoad();
433 popChild();
434 }
435 break;
436 case ClosingBoundary:
437 m_boundaryList.erase(m_boundaryList.begin());
438 // epilogue
439 pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
440 copy_until_boundary(pe);
441 return;
442 case HigherLevelBoundary:
443 m_boundaryList.erase(m_boundaryList.begin());
444 return;
445 }
446 }
447 }
448 inline void onBlock(const char* block, int sz, ParsingElem pe)
449 {
450 switch(pe)
451 {
452 case peIgnore:
453 return;
454 case pePreamble:
455 appendPreambleBlock(block, sz);
456 break;
457 case peEpilogue:
458 appendEpilogueBlock(block, sz);
459 break;
460 case peBody:
461 appendBodyBlock(block, sz);
462 break;
463 }
464 }
465 void jump_to_next_boundary()
466 {
467 copy_until_boundary(peIgnore);
468 }
469 // this is where most of execution time is spent when parsing
470 // large messages; I'm using a plain char[] buffer instead of
471 // std::string because I want to be as fast as possible here
472 virtual void copy_until_boundary(ParsingElem pe)
473 {
474 size_t pos, lines, eomsz = 0;
475 register char c;
476 enum { nlsz = 1 };
477 const char *eom = 0;
478
479 enum { blksz = 4096 };
480 char block[blksz];
481 size_t blkpos = 0;
482 size_t sl_off = 0; // start of line offset into *block
483
484 pos = lines = 0;
485 while(m_bit != m_eit)
486 {
487 // if buffer is full
488 if(blkpos >= blksz - 2 - nlsz)
489 {
490 if(sl_off == 0)
491 {
492 // very long line found, assume it
493 // can't be a boundary and flush the buf
494 // with the partial line
495 block[blkpos] = 0;
496 onBlock(block, blkpos, pe);
497 blkpos = sl_off = 0;
498 } else {
499 // flush the buffer except the last
500 // (probably incomplete) line
501 size_t llen = blkpos - sl_off;
502 onBlock(block, sl_off, pe);
503 memmove(block, block + sl_off, llen);
504 sl_off = 0;
505 blkpos = llen;
506 }
507 }
508 c = *m_bit;
509 if(isnl(c))
510 {
511 char nlbuf[3] = { 0, 0, 0 };
512
513 nlbuf[0] = c; // save the current NL char in nlbuf
514
515 // save the second char of the NL sequence (if any) in nlbuf
516 if(++m_bit != m_eit)
517 {
518 char next = *m_bit;
519 if(next == (c == CR ? LF : CR))
520 {
521 nlbuf[1] = next; // save the next char in the NL seq
522 ++m_bit;
523 }
524 }
525
526 if(pos)
527 {
528 // not an empty row, is this a boundary?
529 block[blkpos] = 0;
530 if(block[sl_off] == '-' && sl_off < blkpos &&
531 block[sl_off+1] == '-')
532 {
533 std::string Line(block+sl_off, blkpos-sl_off);
534 if(isBoundary(Line))
535 {
536 // trim last newline
537 if (sl_off>=2)
538 {
539 int i = sl_off;
540 char a = block[--i];
541 char b = block[--i];
542
543 if(isnl(a,b))
544 sl_off -= 2;
545 else if(isnl(a))
546 sl_off--;
547
548 } else if (sl_off==1 && isnl(block[0])) {
549 sl_off--;
550 }
551 onBlock(block, sl_off, pe);
552 return;
553 }
554 }
555 // exit if this is the end of message
556 // marker
557 if(eom && pos >= eomsz)
558 {
559 char *line = block + sl_off;
560 size_t i = 0;
561 for(; i < eomsz; i++)
562 if(eom[i] != line[i])
563 break;
564 if(i==eomsz) // if eom found
565 {
566 onBlock(block, sl_off,
567 pe);
568 return;
569 }
570 }
571 }
572 // append the saved NL sequence
573 for(int i = 0; nlbuf[i] != 0; i++)
574 block[blkpos++] = nlbuf[i];
575 block[blkpos] = 0;
576 sl_off = blkpos;
577 pos = 0;
578 } else {
579 pos++; // line pos
580 block[blkpos++] = c;
581 ++m_bit;
582 }
583 }
584 // eof
585 block[blkpos] = 0;
586 onBlock(block, blkpos, pe);
587 }
588 };
589
590
591 /*
592 * Forward Iterator
593 */
594 template<typename Iterator>
595 struct IteratorParser<Iterator, std::forward_iterator_tag>:
596 public IteratorParser<Iterator, std::input_iterator_tag>
597 {
598 /* input_iterator ops
599 * *it = xxx
600 * X& op++
601 * X& op++(int)
602 */
603 typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
604 IteratorParser(MimeEntity& me)
605 : base_type(me)
606 {
607 }
608 };
609
610 /*
611 * Bidirectional Iterator
612 */
613 template<typename Iterator>
614 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
615 public IteratorParser<Iterator, std::forward_iterator_tag>
616 {
617 typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
618 IteratorParser(MimeEntity& me)
619 : base_type(me)
620 {
621 }
622 };
623
624 /*
625 * Random Access Iterator
626 */
627 template<typename Iterator>
628 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
629 public IteratorParser<Iterator, std::bidirectional_iterator_tag>
630 {
631 typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
632 IteratorParser(MimeEntity& me)
633 : base_type(me)
634 {
635 }
636 private:
637 using base_type::peIgnore;
638 using base_type::pePreamble;
639 using base_type::peBody;
640 using base_type::peEpilogue;
641
642 using base_type::NoBoundary;
643 using base_type::Boundary;
644 using base_type::ClosingBoundary;
645 using base_type::HigherLevelBoundary;
646
647 using base_type::m_boundaryList;
648 using base_type::m_lastBoundary;
649 using base_type::m_entityStack;
650 using base_type::m_me;
651 using base_type::m_iMask;
652 using base_type::m_bit;
653 using base_type::m_eit;
654 using base_type::isnl;
655
656 typedef TreeNode<char> BoundaryTree;
657 inline void onBlock(Iterator bit, int size, ParsingElem pe)
658 {
659 if(pe == peIgnore)
660 return;
661 Iterator eit = bit + size;
662 MimeEntity* pMe = m_entityStack.top();
663 switch(pe)
664 {
665 case pePreamble:
666 pMe->body().preamble().append(bit, eit);
667 break;
668 case peEpilogue:
669 pMe->body().epilogue().append(bit, eit);
670 break;
671 case peBody:
672 pMe->body().append(bit, eit);
673 break;
674 }
675 }
676 void copy_until_boundary(ParsingElem pe)
677 {
678 // if we don't have any boundary copy until m_eit and return
679 if(m_boundaryList.empty())
680 {
681 onBlock(m_bit, m_eit-m_bit, pe);
682 m_bit = m_eit;
683 return;
684 }
685 // search for current boundary; if not found (i.e. malformed
686 // message) repeat the search for higher level boundary
687 // (slow just for malformed msg, very fast otherwise)
688 typename base_type::BoundaryList::const_iterator
689 bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
690 m_lastBoundary = NoBoundary;
691 int depth = 0;
692 for( ;bBit != bEit; ++bBit, ++depth)
693 {
694 const std::string& boundary = *bBit;
695 Iterator off;
696 if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
697 {
698 Iterator base = m_bit;
699 size_t block_sz = off - base;
700 m_lastBoundary =
701 (depth ? HigherLevelBoundary: Boundary);
702 off += boundary.length();
703 m_bit = off;
704 if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
705 {
706 m_lastBoundary = ClosingBoundary;
707 m_bit = off + 2;
708 }
709 if(m_bit < m_eit-1 && isnl(*m_bit))
710 {
711 char c = *m_bit++;
712 char next = *m_bit;
713 if(isnl(next) && next != c)
714 ++m_bit;
715 }
716
717 // trim last newline
718 if(block_sz)
719 {
720 Iterator p = base + block_sz;
721 char a = *--p, b = *--p;
722 if(isnl(a,b))
723 block_sz -= 2;
724 else if(isnl(a))
725 block_sz--;
726 }
727 onBlock(base, block_sz, pe);
728 return;
729 } else {
730 onBlock(m_bit, m_eit-m_bit, pe);
731 m_bit = m_eit;
732 }
733 }
734 }
735 BoundaryTree m_boundaryTree;
736 void buildBoundaryTree()
737 {
738 m_boundaryTree = BoundaryTree(); // clear
739 typename base_type::BoundaryList::const_iterator
740 bit = m_boundaryList.begin(), eit = m_boundaryList.end();
741 BoundaryTree::NodeList *pChilds;
742 BoundaryTree::NodeList::iterator it;
743 int depth = 0;
744 for( ; bit != eit; ++bit)
745 {
746 pChilds = &m_boundaryTree.childList();
747 it = pChilds->begin();
748 const char *w = bit->c_str();
749 do
750 {
751 it = find_if(pChilds->begin(), pChilds->end(),
752 FindNodePred<char>(*w));
753 if( it == pChilds->end() )
754 it = pChilds->insert(pChilds->end(),*w);
755 pChilds = &it->childList();
756 depth++;
757 } while(*(++w));
758 }
759 }
760
761 };
762
763 }
764
765 #endif

  ViewVC Help
Powered by ViewVC 1.1.26