/[debian]/mimetic/trunk/mimetic/parser/itparser.h
ViewVC logotype

Contents of /mimetic/trunk/mimetic/parser/itparser.h

Parent Directory Parent Directory | Revision Log Revision Log


Revision 511 - (show annotations)
Sun Jan 21 20:13:31 2007 UTC (14 years, 5 months ago) by gregoa
File MIME type: text/plain
File size: 21682 byte(s)
* New upstream release.
1 #ifndef _MIMETIC_PARSER_ITPARSER_H_
2 #define _MIMETIC_PARSER_ITPARSER_H_
3 #include <iterator>
4 #include <algorithm>
5 #include <stack>
6 #include <iostream>
7 #include <mimetic/tree.h>
8 #include <mimetic/utils.h>
9 #include <mimetic/mimeentity.h>
10
11
12 // FIXME: handle HigherLevelClosingBoundary
13
14 namespace mimetic
15 {
16
/// Parses MIME input read through a pair of iterators.
///
/// The primary template is deliberately empty: \p ItCategory defaults to the
/// iterator category of \p Iterator, and the behaviour is supplied by the
/// specializations written for the individual category tags.
template<
    typename Iterator,
    typename ItCategory =
        typename std::iterator_traits<Iterator>::iterator_category
>
struct IteratorParser
{
};
23
24 /*
25 * Input Iterator
26 */
27 template<typename Iterator>
28 struct IteratorParser<Iterator, std::input_iterator_tag>
29 {
30
31 IteratorParser(MimeEntity& me)
32 : m_me(me), m_iMask(imNone), m_lastBoundary(NoBoundary)
33 {
34 m_entityStack.push(&m_me);
35 }
36 virtual ~IteratorParser()
37 {
38 }
39 /**
40 * set the Ignore Mask to \p mask
41 */
42 void iMask(size_t mask) { m_iMask = mask; }
43 /**
44 * get the Ignore Mask
45 */
46 size_t iMask() const { return m_iMask; }
47 /**
48 * start parsing
49 */
50 void run(Iterator bit, Iterator eit)
51 {
52 m_bit = bit;
53 m_eit = eit;
54 doLoad();
55 }
56 protected:
/// List of boundary strings.
typedef std::list<std::string> BoundaryList;

/// Line-break characters.
enum {
    CR = 0x0D,
    LF = 0x0A,
    NL = '\n'
};

/// Destination of the data copied by copy_until_boundary().
enum /* ParsingElem */ {
    peIgnore,     ///< discard the data
    pePreamble,   ///< append to the preamble
    peBody,       ///< append to the body
    peEpilogue    ///< append to the epilogue
};

/// Result of a boundary check.
enum BoundaryType {
    NoBoundary = 0,      ///< not a boundary
    Boundary,            ///< first boundary of the list
    ClosingBoundary,     ///< first boundary of the list followed by "--"
    HigherLevelBoundary  ///< a boundary further down the list
    //, HigherLevelClosingBoundary
};

/// Kind of entity, derived from its Content-Type (see getType()).
enum EntityType {
    etRfc822,
    etMsgRfc822,
    etMultipart
};
81 // vars
82 MimeEntity& m_me;
83 Iterator m_bit, m_eit;
84 size_t m_iMask; // ignore mask
85 BoundaryList m_boundaryList;
86 BoundaryType m_lastBoundary;
87 std::stack<MimeEntity*> m_entityStack;
88
89 protected:
90 void appendPreambleBlock(const char* buf, int sz)
91 {
92 MimeEntity* pMe = m_entityStack.top();
93 pMe->body().preamble().append(buf,sz);
94 }
95
96 void appendEpilogueBlock(const char* buf, int sz)
97 {
98 MimeEntity* pMe = m_entityStack.top();
99 pMe->body().epilogue().append(buf,sz);
100 }
101
102 void appendBodyBlock(const char* buf, int sz)
103 {
104 MimeEntity* pMe = m_entityStack.top();
105 pMe->body().append(buf, sz);
106 }
107
108 std::string getBoundary()
109 {
110 const MimeEntity* pMe = m_entityStack.top();
111 const ContentType& ct = pMe->header().contentType();
112 return std::string("--") + ct.param("boundary");
113 }
114
115 void popChild()
116 {
117 m_entityStack.pop();
118 }
119
120 void pushNewChild()
121 {
122 MimeEntity* pMe = m_entityStack.top();
123 MimeEntity* pChild = new MimeEntity;
124 pMe->body().parts().push_back(pChild);
125 m_entityStack.push(pChild);
126 }
127
128 EntityType getType()
129 {
130 MimeEntity* pMe = m_entityStack.top();
131 const Header& h = pMe->header();
132 // will NOT be automatically created if it doesn't exists;
133 // null ContentType will be returned
134 const ContentType& ct = h.contentType();
135 if(ct.isMultipart())
136 return etMultipart;
137 else if (ct.type() == "message" && ct.subtype() == "rfc822")
138 return etMsgRfc822;
139 else
140 return etRfc822;
141 }
142
143 void addField(const std::string& name, const std::string& value)
144 {
145 MimeEntity* pMe = m_entityStack.top();
146 Header& h = pMe->header();
147 Header::iterator it = h.insert(h.end(), Field());
148 it->name(name);
149 it->value(value);
150 }
151
152 BoundaryType isBoundary(const std::string& line)
153 {
154 if(line.length() == 0 || line[0] != '-')
155 return m_lastBoundary = NoBoundary;
156
157 int level = 0; // multipart nesting level
158 int lineLen = line.length();
159 BoundaryList::const_iterator bit,eit;
160 bit = m_boundaryList.begin(), eit = m_boundaryList.end();
161 for(;bit != eit; ++bit, ++level)
162 {
163 const std::string& b = *bit;
164 int bLen = b.length();
165 if(line.compare(0, bLen, b) == 0)
166 {
167 // not the expected boundary, malformed msg
168 if(level > 0)
169 return m_lastBoundary=HigherLevelBoundary;
170 // plain boundary or closing boundary?
171 if(lineLen > bLen && line.compare(bLen,2,"--") == 0)
172 return m_lastBoundary = ClosingBoundary;
173 else
174 return m_lastBoundary = Boundary;
175 }
176 }
177 return m_lastBoundary = NoBoundary;
178 }
179 // is new line
180 inline bool isnl(char c) const
181 {
182 return (c == CR || c == LF);
183 }
184 // is a two char newline
185 inline bool isnl(char a, char b) const
186 {
187 if(a == CR || a == LF)
188 if(b == (a == CR ? LF : CR))
189 return true;
190 return false;
191 }
192 void doLoad()
193 {
194 loadHeader();
195 loadBody();
196 }
197 bool valid() const
198 {
199 return m_bit != m_eit;
200 }
// Stores c at buf[pos] and advances pos, growing the heap buffer first when
// there is no room left.
//
//   buf    buffer allocated with new[] (or 0 when nothing is allocated yet);
//          replaced when the buffer grows, the old one is delete[]d
//   bufsz  usable size of buf; grown in steps of alloc_block chars. One
//          extra char is always allocated so that the caller can write a
//          terminator at buf[pos]
//   c      the char to store
//   pos    write position, incremented on return
//
// Newly allocated storage is zero filled.
void append(char*& buf, size_t& bufsz, char c, size_t& pos)
{
    enum { alloc_block = 128 };
    // ">=" (and the null check) rather than "==": never write outside of
    // the buffer, not even when the caller hands in an inconsistent state
    if(buf == 0 || pos >= bufsz)
    {
        size_t newSz = bufsz;
        while(pos >= newSz)
            newSz += alloc_block;
        // allocate before touching the caller's state, so that nothing
        // changes if the allocation throws
        char* grown = new char[newSz + 1];
        memset(grown, 0, newSz + 1);
        if(buf != 0)
        {
            memcpy(grown, buf, bufsz);
            delete[] buf;
        }
        buf = grown;
        bufsz = newSz;
    }
    buf[pos++] = c;
}
222 // parses the header and calls addField and pushChild
223 // to add fields and nested entities
224 void loadHeader()
225 {
226 enum {
227 sInit,
228 sIgnoreLine,
229 sNewline,
230 sWaitingName,
231 sWaitingValue,
232 sWaitingFoldedValue,
233 sName,
234 sValue,
235 sIgnoreHeader
236 };
237 register int status;
238 int pos;
239 char *name, *value;
240 size_t nBufSz, vBufSz, nPos, vPos;
241 char prev, c = 0;
242
243 name = value = 0;
244 pos = nBufSz = vBufSz = nPos = vPos = 0;
245 status = (m_iMask & imHeader ? sIgnoreHeader : sInit);
246 //status = sInit;
247 while(m_bit != m_eit)
248 {
249 c = *m_bit;
250 switch(status)
251 {
252 case sInit:
253 if(isnl(c))
254 status = sNewline;
255 else
256 status = sName;
257 continue;
258 case sIgnoreLine:
259 if(!isnl(c))
260 break;
261 status = sNewline;
262 continue;
263 case sNewline:
264 status = sWaitingName;
265 if(pos > 0)
266 {
267 pos = 0;
268 prev = c;
269 if(++m_bit == m_eit) goto out; //eof
270 c = *m_bit;
271 if(c == (prev == CR ? LF : CR))
272 {
273 --pos;
274 break;
275 } else
276 continue;
277 } else {
278 // empty line, end of header
279 prev = c;
280 if(++m_bit == m_eit) goto out; //eof
281 c = *m_bit;
282 if(c == (prev == CR ? LF : CR))
283 ++m_bit;
284 goto out;
285 }
286 case sWaitingName:
287 if(isblank(c))
288 {
289 // folded value
290 status = sWaitingFoldedValue;
291 continue;
292 }
293 // not blank, new field or empty line
294 if(nPos)
295 {
296 name[nPos] = 0;
297 // is not an empty field (name: \n)
298 if(vPos)
299 {
300 value[vPos] = 0;
301 addField(name,value);
302 } else
303 addField(name,"");
304 nPos = vPos = 0;
305 }
306 status = (isnl(c) ? sNewline : sName);
307 continue;
308 case sWaitingValue:
309 if(isblank(c))
310 break; // eat leading blanks
311 status = sValue;
312 continue;
313 case sWaitingFoldedValue:
314 if(isblank(c))
315 break; // eat leading blanks
316 append(value, vBufSz, ' ', vPos);
317 status = sValue;
318 continue;
319 case sName:
320 if(c > 32 && c < 127 && c != ':') {
321 append(name, nBufSz, c, nPos);
322 } else if(c == ':') {
323 status = sWaitingValue;
324 } else {
325 nPos = 0;
326 status = sIgnoreLine;
327 continue;
328 }
329 break;
330 case sValue:
331 if(isnl(c))
332 {
333 status = sNewline;
334 continue;
335 }
336 append(value, vBufSz, c, vPos);
337 break;
338 case sIgnoreHeader:
339 if(isnl(c))
340 {
341 prev = c;
342 if(++m_bit == m_eit) goto out; //eof
343 c = *m_bit;
344 if(c == (prev == CR ? LF : CR))
345 ++m_bit;
346 if(pos == 0)
347 goto out; //empty line, eoh
348 pos = 0;
349 continue;
350 }
351 break;
352 }
353 ++m_bit; ++pos;
354 }
355 out:
356 if(name)
357 delete[] name;
358 if(value)
359 delete[] value;
360 return;
361 }
362 void loadBody()
363 {
364 switch(getType())
365 {
366 case etRfc822:
367 if(m_iMask & imBody)
368 jump_to_next_boundary();
369 else
370 copy_until_boundary(peBody);
371 break;
372 case etMultipart:
373 loadMultipart();
374 break;
375 case etMsgRfc822:
376 if(m_iMask & imChildParts)
377 jump_to_next_boundary();
378 else {
379 pushNewChild();
380 doLoad(); // load child entities
381 popChild();
382 }
383 break;
384 }
385 }
386 void loadMultipart()
387 {
388 std::string boundary = getBoundary();
389 m_boundaryList.push_front(boundary);
390 ParsingElem pe;
391 // preamble
392 pe = (m_iMask & imPreamble ? peIgnore : pePreamble );
393 copy_until_boundary(pe);
394 while(m_bit != m_eit)
395 {
396 switch(m_lastBoundary)
397 {
398 case NoBoundary:
399 return; // eof
400 case Boundary:
401 if(m_iMask & imChildParts)
402 jump_to_next_boundary();
403 else {
404 pushNewChild();
405 doLoad();
406 popChild();
407 }
408 break;
409 case ClosingBoundary:
410 m_boundaryList.erase(m_boundaryList.begin());
411 // epilogue
412 pe=(m_iMask & imEpilogue? peIgnore: peEpilogue);
413 copy_until_boundary(pe);
414 return;
415 case HigherLevelBoundary:
416 m_boundaryList.erase(m_boundaryList.begin());
417 return;
418 }
419 }
420 }
421 inline void onBlock(const char* block, int sz, ParsingElem pe)
422 {
423 switch(pe)
424 {
425 case peIgnore:
426 return;
427 case pePreamble:
428 appendPreambleBlock(block, sz);
429 break;
430 case peEpilogue:
431 appendEpilogueBlock(block, sz);
432 break;
433 case peBody:
434 appendBodyBlock(block, sz);
435 break;
436 }
437 }
438 void jump_to_next_boundary()
439 {
440 copy_until_boundary(peIgnore);
441 }
442 // this is where most of execution time is spent when parsing
443 // large messages; I'm using a plain char[] buffer instead of
444 // std::string because I want to be as fast as possible here
445 virtual void copy_until_boundary(ParsingElem pe)
446 {
447 size_t pos, lines, eomsz = 0;
448 register char c;
449 enum { nlsz = 1 };
450 const char *eom = 0;
451
452 enum { blksz = 4096 };
453 char block[blksz];
454 size_t blkpos = 0;
455 size_t sl_off = 0; // start of line offset into *block
456
457 pos = lines = 0;
458 while(m_bit != m_eit)
459 {
460 // if buffer is full
461 if(blkpos >= blksz - 2 - nlsz)
462 {
463 if(sl_off == 0)
464 {
465 // very long line found, assume it
466 // can't be a boundary and flush the buf
467 // with the partial line
468 block[blkpos] = 0;
469 onBlock(block, blkpos, pe);
470 blkpos = sl_off = 0;
471 } else {
472 // flush the buffer except the last
473 // (probably incomplete) line
474 size_t llen = blkpos - sl_off;
475 onBlock(block, sl_off, pe);
476 memmove(block, block + sl_off, llen);
477 sl_off = 0;
478 blkpos = llen;
479 }
480 }
481 c = *m_bit;
482 if(isnl(c))
483 {
484 char nlbuf[3] = { 0, 0, 0 };
485
486 nlbuf[0] = c; // save the current NL char in nlbuf
487
488 // save the second char of the NL sequence (if any) in nlbuf
489 if(++m_bit != m_eit)
490 {
491 char next = *m_bit;
492 if(next == (c == CR ? LF : CR))
493 {
494 nlbuf[1] = next; // save the next char in the NL seq
495 ++m_bit;
496 }
497 }
498
499 if(pos)
500 {
501 // not an empty row, is this a boundary?
502 block[blkpos] = 0;
503 if(block[sl_off] == '-' && sl_off < blkpos &&
504 block[sl_off+1] == '-')
505 {
506 std::string Line(block+sl_off, blkpos-sl_off);
507 if(isBoundary(Line))
508 {
509 // trim last newline
510 if (sl_off>=2)
511 {
512 int i = sl_off;
513 char a = block[--i];
514 char b = block[--i];
515
516 if(isnl(a,b))
517 sl_off -= 2;
518 else if(isnl(a))
519 sl_off--;
520
521 } else if (sl_off==1 && isnl(block[0])) {
522 sl_off--;
523 }
524 onBlock(block, sl_off, pe);
525 return;
526 }
527 }
528 // exit if this is the end of message
529 // marker
530 if(eom && pos >= eomsz)
531 {
532 char *line = block + sl_off;
533 size_t i = 0;
534 for(; i < eomsz; i++)
535 if(eom[i] != line[i])
536 break;
537 if(i==eomsz) // if eom found
538 {
539 onBlock(block, sl_off,
540 pe);
541 return;
542 }
543 }
544 }
545 // append the saved NL sequence
546 for(int i = 0; nlbuf[i] != 0; i++)
547 block[blkpos++] = nlbuf[i];
548 block[blkpos] = 0;
549 sl_off = blkpos;
550 pos = 0;
551 } else {
552 pos++; // line pos
553 block[blkpos++] = c;
554 ++m_bit;
555 }
556 }
557 // eof
558 block[blkpos] = 0;
559 onBlock(block, blkpos, pe);
560 }
561 };
562
563
564 /*
565 * Forward Iterator
566 */
567 template<typename Iterator>
568 struct IteratorParser<Iterator, std::forward_iterator_tag>:
569 public IteratorParser<Iterator, std::input_iterator_tag>
570 {
571 /* input_iterator ops
572 * *it = xxx
573 * X& op++
574 * X& op++(int)
575 */
576 typedef IteratorParser<Iterator, std::input_iterator_tag> base_type;
577 IteratorParser(MimeEntity& me)
578 : base_type(me)
579 {
580 }
581 };
582
583 /*
584 * Bidirectional Iterator
585 */
586 template<typename Iterator>
587 struct IteratorParser<Iterator, std::bidirectional_iterator_tag>:
588 public IteratorParser<Iterator, std::forward_iterator_tag>
589 {
590 typedef IteratorParser<Iterator, std::forward_iterator_tag> base_type;
591 IteratorParser(MimeEntity& me)
592 : base_type(me)
593 {
594 }
595 };
596
597 /*
598 * Random Access Iterator
599 */
600 template<typename Iterator>
601 struct IteratorParser<Iterator, std::random_access_iterator_tag>:
602 public IteratorParser<Iterator, std::bidirectional_iterator_tag>
603 {
604 typedef IteratorParser<Iterator, std::bidirectional_iterator_tag> base_type;
605 IteratorParser(MimeEntity& me)
606 : base_type(me)
607 {
608 }
609 private:
610 using base_type::peIgnore;
611 using base_type::pePreamble;
612 using base_type::peBody;
613 using base_type::peEpilogue;
614
615 using base_type::NoBoundary;
616 using base_type::Boundary;
617 using base_type::ClosingBoundary;
618 using base_type::HigherLevelBoundary;
619
620 using base_type::m_boundaryList;
621 using base_type::m_lastBoundary;
622 using base_type::m_entityStack;
623 using base_type::m_me;
624 using base_type::m_iMask;
625 using base_type::m_bit;
626 using base_type::m_eit;
627 using base_type::isnl;
628
629 typedef TreeNode<char> BoundaryTree;
630 inline void onBlock(Iterator bit, int size, ParsingElem pe)
631 {
632 if(pe == peIgnore)
633 return;
634 Iterator eit = bit + size;
635 MimeEntity* pMe = m_entityStack.top();
636 switch(pe)
637 {
638 case pePreamble:
639 pMe->body().preamble().append(bit, eit);
640 break;
641 case peEpilogue:
642 pMe->body().epilogue().append(bit, eit);
643 break;
644 case peBody:
645 pMe->body().append(bit, eit);
646 break;
647 }
648 }
649 void copy_until_boundary(ParsingElem pe)
650 {
651 // if we don't have any boundary copy until m_eit and return
652 if(m_boundaryList.empty())
653 {
654 onBlock(m_bit, m_eit-m_bit, pe);
655 m_bit = m_eit;
656 return;
657 }
658 // search for current boundary; if not found (i.e. malformed
659 // message) repeat the search for higher level boundary
660 // (slow just for malformed msg, very fast otherwise)
661 typename base_type::BoundaryList::const_iterator
662 bBit = m_boundaryList.begin(), bEit = m_boundaryList.end();
663 m_lastBoundary = NoBoundary;
664 int depth = 0;
665 for( ;bBit != bEit; ++bBit, ++depth)
666 {
667 const std::string& boundary = *bBit;
668 Iterator off;
669 if( (off=utils::find_bm(m_bit,m_eit,boundary)) != m_eit)
670 {
671 Iterator base = m_bit;
672 size_t block_sz = off - base;
673 m_lastBoundary =
674 (depth ? HigherLevelBoundary: Boundary);
675 off += boundary.length();
676 m_bit = off;
677 if(off<m_eit-1 && *off =='-' && *(off+1) == '-')
678 {
679 m_lastBoundary = ClosingBoundary;
680 m_bit = off + 2;
681 }
682 if(m_bit < m_eit-1 && isnl(*m_bit))
683 {
684 char c = *m_bit++;
685 char next = *m_bit;
686 if(isnl(next) && next != c)
687 ++m_bit;
688 }
689
690 // trim last newline
691 if(block_sz)
692 {
693 Iterator p = base + block_sz;
694 char a = *--p, b = *--p;
695 if(isnl(a,b))
696 block_sz -= 2;
697 else if(isnl(a))
698 block_sz--;
699 }
700 onBlock(base, block_sz, pe);
701 return;
702 } else {
703 onBlock(m_bit, m_eit-m_bit, pe);
704 m_bit = m_eit;
705 }
706 }
707 }
708 BoundaryTree m_boundaryTree;
709 void buildBoundaryTree()
710 {
711 m_boundaryTree = BoundaryTree(); // clear
712 typename base_type::BoundaryList::const_iterator
713 bit = m_boundaryList.begin(), eit = m_boundaryList.end();
714 BoundaryTree::NodeList *pChilds;
715 BoundaryTree::NodeList::iterator it;
716 int depth = 0;
717 for( ; bit != eit; ++bit)
718 {
719 pChilds = &m_boundaryTree.childList();
720 it = pChilds->begin();
721 const char *w = bit->c_str();
722 do
723 {
724 it = find_if(pChilds->begin(), pChilds->end(),
725 FindNodePred<char>(*w));
726 if( it == pChilds->end() )
727 it = pChilds->insert(pChilds->end(),*w);
728 pChilds = &it->childList();
729 depth++;
730 } while(*(++w));
731 }
732 }
733
734 };
735
736 }
737
738 #endif

  ViewVC Help
Powered by ViewVC 1.1.26