ANTLR Support Libraries 2.7.1+
antlr/CharScanner.hpp
Go to the documentation of this file.
00001 #ifndef INC_CharScanner_hpp__
00002 #define INC_CharScanner_hpp__
00003 
00004 /* ANTLR Translator Generator
00005  * Project led by Terence Parr at http://www.jGuru.com
00006  * Software rights: http://www.antlr.org/license.html
00007  *
00008  * $Id: //depot/code/org.antlr/release/antlr-2.7.7/lib/cpp/antlr/CharScanner.hpp#2 $
00009  */
00010 
00011 #include <antlr/config.hpp>
00012 
00013 #include <map>
00014 #include <strings.h>
00015 #include <cstdio>
00016 
00017 #ifdef HAS_NOT_CCTYPE_H
00018 #include <ctype.h>
00019 #else
00020 #include <cctype>
00021 #endif
00022 
00023 #if ( _MSC_VER == 1200 )
00024 // VC6 seems to need this
00025 // note that this is not a standard C++ include file.
00026 # include <stdio.h>
00027 #endif
00028 
00029 #include <antlr/TokenStream.hpp>
00030 #include <antlr/RecognitionException.hpp>
00031 #include <antlr/SemanticException.hpp>
00032 #include <antlr/MismatchedCharException.hpp>
00033 #include <antlr/InputBuffer.hpp>
00034 #include <antlr/BitSet.hpp>
00035 #include <antlr/LexerSharedInputState.hpp>
00036 
00037 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
00038 namespace antlr {
00039 #endif
00040 
00041 class ANTLR_API CharScanner;
00042 
00043 ANTLR_C_USING(tolower)
00044 
00045 #ifdef ANTLR_REALLY_NO_STRCASECMP
00046 // Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
00047 // on the mac has neither...
// Fallback case-insensitive comparison of two NUL-terminated strings.
// Returns -1, 0 or 1 when s1 sorts before, equal to, or after s2, ignoring
// case as defined by tolower().
00048 inline int strcasecmp(const char *s1, const char *s2)
00049 {
00050    while (true)
00051    {
            // tolower() is only defined for EOF and values representable as
            // unsigned char, so convert first; plain char may be negative for
            // bytes >= 0x80. Comparing the results as int also orders such
            // bytes after ASCII, as POSIX strcasecmp does.
00052       const int c1 = tolower(static_cast<unsigned char>(*s1++));
00053       const int c2 = tolower(static_cast<unsigned char>(*s2++));
00054       if (c1 < c2) return -1;
00055       if (c1 > c2) return 1;
            // Both characters are equal here; a NUL means both strings ended.
00056       if (c1 == 0) return 0;
00057    }
00058 }
00059 #else
00060 #ifdef NO_STRCASECMP
00061 ANTLR_C_USING(stricmp)
00062 #else
00063 ANTLR_C_USING(strcasecmp)
00064 #endif
00065 #endif
00066 
/*
 * Ordering functor used as the comparator of the CharScanner literals map.
 * It holds a pointer to the owning scanner; operator() (defined after the
 * CharScanner class) asks that scanner whether literals are case sensitive
 * and compares the two strings accordingly.
 */
00069 class ANTLR_API CharScannerLiteralsLess {
00070 private:
00071    const CharScanner* scanner;
00072 public:
         // std::binary_function was deprecated in C++11 and removed in C++17.
         // These typedefs are exactly what the former base class provided, so
         // code relying on them keeps working.
         typedef ANTLR_USE_NAMESPACE(std)string first_argument_type;
         typedef ANTLR_USE_NAMESPACE(std)string second_argument_type;
         typedef bool result_type;
00073 #ifdef NO_TEMPLATE_PARTS
         // Initialise the pointer so a default-constructed comparator never
         // carries an indeterminate value.
00074    CharScannerLiteralsLess() : scanner(0) {} // not really used, definition to appease MSVC
00075 #endif
         // theScanner is stored, not owned; it is dereferenced by operator().
00076    CharScannerLiteralsLess(const CharScanner* theScanner)
00077    : scanner(theScanner)
00078    {
00079    }
         // Returns true when x orders strictly before y.
00080    bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
00081 // defaults are good enough..
00082    // CharScannerLiteralsLess(const CharScannerLiteralsLess&);
00083    // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
00084 };
00085 
/*
 * Character-level scanner base class derived from TokenStream.
 *
 * Characters are read through inputState->getInput(); consumed characters are
 * optionally accumulated in `text`, and line/column bookkeeping is kept in the
 * shared input state. Copying is disabled: the copy constructor and copy
 * assignment operator are declared private and left undefined.
 */
00088 class ANTLR_API CharScanner : public TokenStream {
00089 protected:
         // Pointer to a parameterless function returning a RefToken; called by makeToken().
00090    typedef RefToken (*factory_type)();
00091 public:
         // Constructors are only declared here; their definitions are not part
         // of this header. Presumably case_sensitive initialises caseSensitive
         // -- confirm against the implementation file.
00092    CharScanner(InputBuffer& cb, bool case_sensitive );
00093    CharScanner(InputBuffer* cb, bool case_sensitive );
00094    CharScanner(const LexerSharedInputState& state, bool case_sensitive );
00095 
00096    virtual ~CharScanner()
00097    {
00098    }
00099 
         // Look ahead i characters. Defined inline after the class: returns the
         // input buffer's LA(i), passed through toLower() when !caseSensitive.
00100    virtual int LA(unsigned int i);
00101 
         // Append one character to the token text; no-op unless saveConsumedInput is set.
00102    virtual void append(char c)
00103    {
00104       if (saveConsumedInput)
00105       {
00106          size_t l = text.length();
00107 
               // Ask for capacity in steps of 256 each time the length reaches a multiple of 256.
00108          if ((l%256) == 0)
00109             text.reserve(l+256);
00110 
               // Insert the single character at position l, i.e. at the end of text.
00111          text.replace(l,0,&c,1);
00112       }
00113    }
00114 
         // Append a whole string to the token text; no-op unless saveConsumedInput is set.
00115    virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
00116    {
00117       if( saveConsumedInput )
00118          text += s;
00119    }
00120 
         // Forward commit() to the underlying input buffer.
00121    virtual void commit()
00122    {
00123       inputState->getInput().commit();
00124    }
00125 
         // Error recovery: consume one character, then keep consuming until
         // LA(1) is EOF_CHAR or a member of tokenSet. `ex` is not used here.
00129    virtual void recover(const RecognitionException& ex, const BitSet& tokenSet)
00130    {
00131       consume();
00132       consumeUntil(tokenSet);
00133    }
00134 
         // Consume one character from the input buffer. When not guessing
         // (inputState->guessing == 0) the character is first appended to the
         // token text and the column is updated.
00135    virtual void consume()
00136    {
00137       if (inputState->guessing == 0)
00138       {
00139          int c = LA(1);
00140          if (caseSensitive)
00141          {
                  // NOTE(review): the int is implicitly narrowed to char by append(char).
00142             append(c);
00143          }
00144          else
00145          {
00146             // use input.LA(), not LA(), to get original case
00147             // CharScanner.LA() would toLower it.
00148             append(inputState->getInput().LA(1));
00149          }
00150 
00151          // RK: in a sense I don't like this automatic handling.
               // A tab moves the column to the next tab stop; anything else advances it by one.
00152          if (c == '\t')
00153             tab();
00154          else
00155             inputState->column++;
00156       }
            // The buffer is advanced whether or not we are guessing.
00157       inputState->getInput().consume();
00158    }
00159 
         // Consume characters until LA(1) is EOF_CHAR or equals c; that character is left unconsumed.
00161    virtual void consumeUntil(int c)
00162    {
00163       for(;;)
00164       {
00165          int la_1 = LA(1);
00166          if( la_1 == EOF_CHAR || la_1 == c )
00167             break;
00168          consume();
00169       }
00170    }
00171 
         // Consume characters until LA(1) is EOF_CHAR or a member of set; that character is left unconsumed.
00173    virtual void consumeUntil(const BitSet& set)
00174    {
00175       for(;;)
00176       {
00177          int la_1 = LA(1);
00178          if( la_1 == EOF_CHAR || set.member(la_1) )
00179             break;
00180          consume();
00181       }
00182    }
00183 
         // Forward mark() to the input buffer and return its result.
00185    virtual unsigned int mark()
00186    {
00187       return inputState->getInput().mark();
00188    }
         // Forward rewind(pos) to the input buffer.
00190    virtual void rewind(unsigned int pos)
00191    {
00192       inputState->getInput().rewind(pos);
00193    }
00194 
         // Require LA(1) == c and consume it; otherwise throw MismatchedCharException.
00196    virtual void match(int c)
00197    {
00198       int la_1 = LA(1);
00199       if ( la_1 != c )
00200          throw MismatchedCharException(la_1, c, false, this);
00201       consume();
00202    }
00203 
         // Require LA(1) to be a member of b and consume it; otherwise throw MismatchedCharException.
00207    virtual void match(const BitSet& b)
00208    {
00209       int la_1 = LA(1);
00210 
00211       if ( !b.member(la_1) )
00212          throw MismatchedCharException( la_1, b, false, this );
00213       consume();
00214    }
00215 
         // Match every character of the NUL-terminated string s in order,
         // consuming as it goes; throws MismatchedCharException at the first
         // difference (characters matched before it stay consumed).
00219    virtual void match( const char* s )
00220    {
00221       while( *s != '\0' )
00222       {
00223          // the & 0xFF is here to prevent sign extension later on
00224          int la_1 = LA(1), c = (*s++ & 0xFF);
00225 
00226          if ( la_1 != c )
00227             throw MismatchedCharException(la_1, c, false, this);
00228 
00229          consume();
00230       }
00231    }
         // Same as match(const char*), but iterates over s.length() characters of a std::string.
00235    virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
00236    {
00237       size_t len = s.length();
00238 
00239       for (size_t i = 0; i < len; i++)
00240       {
00241          // the & 0xFF is here to prevent sign extension later on
00242          int la_1 = LA(1), c = (s[i] & 0xFF);
00243 
00244          if ( la_1 != c )
00245             throw MismatchedCharException(la_1, c, false, this);
00246 
00247          consume();
00248       }
00249    }
         // Require LA(1) != c and consume it; otherwise throw MismatchedCharException.
         // The boolean passed to the exception is true here and false in match();
         // presumably it marks a negated match -- confirm in MismatchedCharException.
00253    virtual void matchNot(int c)
00254    {
00255       int la_1 = LA(1);
00256 
00257       if ( la_1 == c )
00258          throw MismatchedCharException(la_1, c, true, this);
00259 
00260       consume();
00261    }
         // Require c1 <= LA(1) <= c2 and consume it; otherwise throw MismatchedCharException.
00265    virtual void matchRange(int c1, int c2)
00266    {
00267       int la_1 = LA(1);
00268 
00269       if ( la_1 < c1 || la_1 > c2 )
00270          throw MismatchedCharException(la_1, c1, c2, false, this);
00271 
00272       consume();
00273    }
00274 
         // Whether LA() returns characters unchanged (true) or lower-cased (false).
00275    virtual bool getCaseSensitive() const
00276    {
00277       return caseSensitive;
00278    }
00279 
00280    virtual void setCaseSensitive(bool t)
00281    {
00282       caseSensitive = t;
00283    }
00284 
         // Pure virtual. CharScannerLiteralsLess::operator() calls this to pick
         // case-sensitive or case-insensitive ordering of literals.
00285    virtual bool getCaseSensitiveLiterals() const=0;
00286 
         // Line stored in the shared input state.
00288    virtual int getLine() const
00289    {
00290       return inputState->line;
00291    }
00292 
00294    virtual void setLine(int l)
00295    {
00296       inputState->line = l;
00297    }
00298 
         // Column stored in the shared input state.
00300    virtual int getColumn() const
00301    {
00302       return inputState->column;
00303    }
00305    virtual void setColumn(int c)
00306    {
00307       inputState->column = c;
00308    }
00309 
         // File name stored in the shared input state.
00311    virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
00312    {
00313       return inputState->filename;
00314    }
00316    virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
00317    {
00318       inputState->filename = f;
00319    }
00320 
         // Accessors for the commitToPath flag; nothing else in this header reads it.
00321    virtual bool getCommitToPath() const
00322    {
00323       return commitToPath;
00324    }
00325 
00326    virtual void setCommitToPath(bool commit)
00327    {
00328       commitToPath = commit;
00329    }
00330 
         // Text accumulated so far by append().
00332    virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
00333    {
00334       return text;
00335    }
00336 
00337    virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
00338    {
00339       text = s;
00340    }
00341 
         // Clear the accumulated text and record the current column/line as the token start position.
00342    virtual void resetText()
00343    {
00344       text = "";
00345       inputState->tokenStartColumn = inputState->column;
00346       inputState->tokenStartLine = inputState->line;
00347    }
00348 
         // Returns the _returnToken member.
00349    virtual RefToken getTokenObject() const
00350    {
00351       return _returnToken;
00352    }
00353 
         // Advance to the next line and reset the column to 1.
00357    virtual void newline()
00358    {
00359       ++inputState->line;
00360       inputState->column = 1;
00361    }
00362 
         // Move the column to the next tab stop (columns are handled as 1-based here).
         // NOTE(review): divides by tabsize; setTabsize() does not reject 0.
00367    virtual void tab()
00368    {
00369       int c = getColumn();
00370       int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop
00371       setColumn( nc );
00372    }
         // Set the tab size used by tab() and return the previous value.
00374    int setTabsize( int size )
00375    {
00376       int oldsize = tabsize;
00377       tabsize = size;
00378       return oldsize;
00379    }
         // Current tab size used by tab().
00381    int getTabSize() const
00382    {
00383       return tabsize;
00384    }
00385 
         // Error and warning reporting hooks; declared here, defined elsewhere.
00387    virtual void reportError(const RecognitionException& e);
00388 
00390    virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
00391 
00393    virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
00394 
         // Input buffer of the current shared input state.
00395    virtual InputBuffer& getInputBuffer()
00396    {
00397       return inputState->getInput();
00398    }
00399 
         // Returns the shared input state handle by value.
00400    virtual LexerSharedInputState getInputState()
00401    {
00402       return inputState;
00403    }
00404 
         // Replace the shared input state used by this scanner.
00407    virtual void setInputState(LexerSharedInputState state)
00408    {
00409       inputState = state;
00410    }
00411 
         // Set the factory function that makeToken() calls to create tokens.
00413    virtual void setTokenObjectFactory(factory_type factory)
00414    {
00415       tokenFactory = factory;
00416    }
00417 
         // Look up the current token text in the literals map; return the mapped
         // value if present, otherwise the ttype that was passed in.
00421    virtual int testLiteralsTable(int ttype) const
00422    {
00423       ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
00424       if (i != literals.end())
00425          ttype = (*i).second;
00426       return ttype;
00427    }
00428 
         // Same lookup as above, but for the caller-supplied string txt instead of the current text.
00434    virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
00435    {
00436       ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
00437       if (i != literals.end())
00438          ttype = (*i).second;
00439       return ttype;
00440    }
00441 
         // Lower-case c with tolower(), except that EOF_CHAR is returned unchanged.
         // NOTE(review): tolower() is only defined for EOF and unsigned char values;
         // whether the input buffer can deliver anything else is not visible here.
00443    virtual int toLower(int c) const
00444    {
00445       // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
00446       // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
00447       // this one is more structural. Maybe make this configurable.
00448       return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
00449    }
00450 
         // Hook with an empty default body; when it is invoked is not visible in this header.
00466    virtual void uponEOF()
00467    {
00468    }
00469 
         // Tracing hooks, declared here and defined elsewhere. Tracer (below) calls traceIn/traceOut.
00471    virtual void traceIndent();
00472    virtual void traceIn(const char* rname);
00473    virtual void traceOut(const char* rname);
00474 
         // EOF_CHAR has the value of the C library's EOF; an enum is used when
         // NO_STATIC_CONSTS is defined.
00475 #ifndef NO_STATIC_CONSTS
00476    static const int EOF_CHAR = EOF;
00477 #else
00478    enum {
00479       EOF_CHAR = EOF
00480    };
00481 #endif
00482 protected:
00483    ANTLR_USE_NAMESPACE(std)string text; // characters accumulated by append(); cleared by resetText()
00484 
00485    bool saveConsumedInput;             // when false, both append() overloads do nothing
00486    factory_type tokenFactory;          // creates token objects for makeToken()
00487    bool caseSensitive;                 // when false, LA() lower-cases what it returns
00488    ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
00489 
00490    RefToken _returnToken;     // returned by getTokenObject()
00491 
         // Shared input state: input buffer, line/column, file name, guessing level, token start position.
00493    LexerSharedInputState inputState;
00494 
         // Only read and written through getCommitToPath()/setCommitToPath() in this header.
00499    bool commitToPath;
00500 
00501    int tabsize;   // tab stop width used by tab()
00502 
         // Create a token via tokenFactory, set its type to t, and stamp it with
         // the token start column/line recorded by resetText().
         // NOTE(review): tokenFactory is called without a null check; presumably
         // the constructors (defined elsewhere) initialise it -- confirm.
00504    virtual RefToken makeToken(int t)
00505    {
00506       RefToken tok = tokenFactory();
00507       tok->setType(t);
00508       tok->setColumn(inputState->tokenStartColumn);
00509       tok->setLine(inputState->tokenStartLine);
00510       return tok;
00511    }
00512 
         // Scope helper: the constructor calls traceIn(text) on the scanner and the
         // destructor calls traceOut(text) with the same text. Not copyable.
00515    class Tracer {
00516    private:
00517       CharScanner* parser;
00518       const char* text;
00519 
00520       Tracer(const Tracer& other);              // undefined
00521       Tracer& operator=(const Tracer& other);   // undefined
00522    public:
            // p and t are stored as given; p is dereferenced here and again in the destructor.
00523       Tracer( CharScanner* p,const char* t )
00524       : parser(p), text(t)
00525       {
00526          parser->traceIn(text);
00527       }
00528       ~Tracer()
00529       {
00530          parser->traceOut(text);
00531       }
00532    };
00533 
         // Not used in this header; presumably the nesting depth maintained by
         // traceIn()/traceOut() -- confirm in their definitions.
00534    int traceDepth;
00535 private:
         // Declared but never defined, so CharScanner objects cannot be copied.
00536    CharScanner( const CharScanner& other );              // undefined
00537    CharScanner& operator=( const CharScanner& other );   // undefined
00538 
         // NO_CHAR is 0; it is not referenced anywhere else in this header.
00539 #ifndef NO_STATIC_CONSTS
00540    static const int NO_CHAR = 0;
00541 #else
00542    enum {
00543       NO_CHAR = 0
00544    };
00545 #endif
00546 };
00547 
// Look ahead i characters in the input buffer. The character is returned
// unchanged for a case-sensitive scanner and lower-cased otherwise.
00548 inline int CharScanner::LA(unsigned int i)
00549 {
00550    // Raw look-ahead character as delivered by the input buffer.
00551    const int ch = inputState->getInput().LA(i);
00552 
00553    // toLower() returns EOF_CHAR untouched, which is where the VC 6
00554    // tolower bug is caught.
00555    return caseSensitive ? ch : toLower(ch);
00556 }
00557 
// Returns true when x orders strictly before y. The owning scanner decides
// whether the comparison ignores case.
00558 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
00559 {
00560    // Case-insensitive literals: compare the C strings while ignoring case.
00561    if (!scanner->getCaseSensitiveLiterals())
00562    {
00563 #ifdef NO_STRCASECMP
00564       return (stricmp(x.c_str(),y.c_str())<0);
00565 #else
00566       return (strcasecmp(x.c_str(),y.c_str())<0);
00567 #endif
00568    }
00569    return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
00570 }
00571 
00572 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
00573 }
00574 #endif
00575 
00576 #endif //INC_CharScanner_hpp__
 All Classes Namespaces Files Functions Variables Typedefs Friends Defines