Synopsis - Synopsis/Parser/C++/occ/token.cc

    1| /*
    2|   Copyright (C) 1997-2000 Shigeru Chiba, University of Tsukuba.
    3|
    4|   Permission to use, copy, distribute and modify this software and
    5|   its documentation for any purpose is hereby granted without fee,
    6|   provided that the above copyright notice appear in all copies and that
    7|   both that copyright notice and this permission notice appear in
    8|   supporting documentation.
    9|
   10|   Shigeru Chiba makes no representations about the suitability of this
   11|   software for any purpose.  It is provided "as is" without express or
   12|   implied warranty.
   13| */
   14| /*
   15|   Copyright (c) 1995, 1996 Xerox Corporation.
   16|   All Rights Reserved.
   17|
   18|   Use and copying of this software and preparation of derivative works
   19|   based upon this software are permitted. Any copy of this software or
   20|   of any derivative work must include the above copyright notice of
   21|   Xerox Corporation, this paragraph and the one after it.  Any
   22|   distribution of this software or derivative works must comply with all
   23|   applicable United States export control laws.
   24|
   25|   This software is made available AS IS, and XEROX CORPORATION DISCLAIMS
   26|   ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE
   27|   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   28|   PURPOSE, AND NOTWITHSTANDING ANY OTHER PROVISION CONTAINED HEREIN, ANY
   29|   LIABILITY FOR DAMAGES RESULTING FROM THE SOFTWARE OR ITS USE IS
   30|   EXPRESSLY DISCLAIMED, WHETHER ARISING IN CONTRACT, TORT (INCLUDING
   31|   NEGLIGENCE) OR STRICT LIABILITY, EVEN IF XEROX CORPORATION IS ADVISED
   32|   OF THE POSSIBILITY OF SUCH DAMAGES.
   33| */
   34|
   35| #include <cstdlib>
   36| #include <cstring>
   37| #include <iostream>
   38| #include "token.h"
   39| #include "hash.h"
   40| #include "ptree.h"
   41| #include "buffer.h"
   42|
   43| #if defined(_PARSE_VCC)
   44| #define _MSC_VER
   45| #endif
   46|
   47| #if defined(_MSC_VER)
#include <assert.h>            // for assert in InitializeOtherKeywords
   49| #endif
   50|
   51| extern BOOL regularCpp;        // defined in main.cc
   52| static void InitializeOtherKeywords();
   53|
   54| #ifdef TEST
   55|
   56| #ifdef __GNUG__
   57| #define token(x)
   58| #else
   59| #define token(x)        (
   60| #endif
   61|
   62| #else
   63|
   64| #define token(x)
   65|
   66| #endif
   67|
   68| // class Lex
   69|
   70| HashTable* Lex::user_keywords = nil;
   71| Ptree* Lex::comments = nil;
   72|
   73| Lex::Lex(Program* prog) : fifo(this)
   74| {
   75|     file = prog;
   76|     prog->Rewind();
   77|     last_token = '\n';
   78|     tokenp = 0;
   79|     token_len = 0;
   80|
   81|     // Re-init incase used multiple times by Synopsis
   82|     comments = nil;
   83|     user_keywords = nil;
   84|
   85|     InitializeOtherKeywords();
   86| }
   87|
   88| char* Lex::Save()
   89| {
   90|     char* pos;
   91|     int len;
   92|
   93|     fifo.Peek(0, pos, len);
   94|     return pos;
   95| }
   96|
   97| void Lex::Restore(char* pos)
   98| {
   99|     last_token = '\n';
  100|     tokenp = 0;
  101|     token_len = 0;
  102|     fifo.Clear();
  103|     Rewind(pos);
  104| }
  105|
  106| // ">>" is either the shift operator or double closing brackets.
  107|
  108| void Lex::GetOnlyClosingBracket(Token& t)
  109| {
  110|     Restore(t.ptr + 1);
  111| }
  112|
  113| uint Lex::LineNumber(char* pos, char*& ptr, int& len)
  114| {
  115|     return file->LineNumber(pos, ptr, len);
  116| }
  117|
  118| int Lex::GetToken(Token& t)
  119| {
  120|     t.kind = fifo.Pop(t.ptr, t.len);
  121|     return t.kind;
  122| }
  123|
  124| int Lex::LookAhead(int offset)
  125| {
  126|     return fifo.Peek(offset);
  127| }
  128|
  129| int Lex::LookAhead(int offset, Token& t)
  130| {
  131|     t.kind = fifo.Peek(offset, t.ptr, t.len);
  132|     return t.kind;
  133| }
  134|
  135| char* Lex::TokenPosition()
  136| {
  137|     return (char*)file->Read(Tokenp());
  138| }
  139|
  140| char Lex::Ref(uint i)
  141| {
  142|     return file->Ref(i);
  143| }
  144|
  145| void Lex::Rewind(char* p)
  146| {
  147|     file->Rewind(p - file->Read(0));
  148| }
  149|
  150| bool Lex::RecordKeyword(char* keyword, int token)
  151| {
  152|     int index;
  153|     char* str;
  154|
  155|     if(keyword == nil)
  156|         return FALSE;
  157|
  158|     str = new(GC) char[strlen(keyword) + 1];
  159|     strcpy(str, keyword);
  160|
  161|     if(user_keywords == nil)
  162|         user_keywords = new HashTable;
  163|
  164|     if(user_keywords->AddEntry(str, (HashValue)token, &index) >= 0)
  165|         return TRUE;
  166|     else
  167|         return bool(user_keywords->Peek(index) == (HashValue)token);
  168| }
  169|
  170| bool Lex::Reify(Ptree* t, unsigned int& value)
  171| {
  172|     if(t == nil || !t->IsLeaf())
  173|         return FALSE;
  174|
  175|     char* p = t->GetPosition();
  176|     int len = t->GetLength();
  177|     value = 0;
  178|     if(len > 2 && *p == '0' && is_xletter(p[1])){
  179|         for(int i = 2; i < len; ++i){
  180|             char c = p[i];
  181|             if(is_digit(c))
  182|                value = value * 0x10 + (c - '0');
  183|             else if('A' <= c && c <= 'F')
  184|                value = value * 0x10 + (c - 'A' + 10);
  185|             else if('a' <= c && c <= 'f')
  186|                value = value * 0x10 + (c - 'a' + 10);
  187|             else if(is_int_suffix(c))
  188|         break;
  189|         else
  190|                return FALSE;
  191|         }
  192|
  193|         return TRUE;
  194|     }
  195|     else if(len > 0 && is_digit(*p)){
  196|         for(int i = 0; i < len; ++i){
  197|             char c = p[i];
  198|             if(is_digit(c))
  199|                value = value * 10 + c - '0';
  200|             else if(is_int_suffix(c))
  201|         break;
  202|         else
  203|                return FALSE;
  204|         }
  205|
  206|         return TRUE;
  207|     }
  208|     else
  209|         return FALSE;
  210| }
  211|
  212| // Reify() doesn't interpret an escape character.
  213|
  214| bool Lex::Reify(Ptree* t, char*& str)
  215| {
  216|     if(t == nil || !t->IsLeaf())
  217|         return FALSE;
  218|
  219|     char* p = t->GetPosition();
  220|     int length = t->GetLength();
  221|     if(*p != '"')
  222|         return FALSE;
  223|     else{
  224|         str = new(GC) char[length];
  225|         char* sp = str;
  226|         for(int i = 1; i < length; ++i)
  227|             if(p[i] != '"'){
  228|                *sp++ = p[i];
  229|                if(p[i] == '\\' && i + 1 < length)
  230|                    *sp++ = p[++i];
  231|         }
  232|         else
  233|                while(++i < length && p[i] != '"')
  234|         ;
  235|
  236|         *sp = '\0';
  237|         return TRUE;
  238|     }
  239| }
  240|
  241| // class TokenFifo
  242|
  243| Lex::TokenFifo::TokenFifo(Lex* l)
  244| {
  245|     lex = l;
  246|     size = 16;
  247|     ring = new (GC) Slot[size];
  248|     head = tail = 0;
  249| }
  250|
  251| Lex::TokenFifo::~TokenFifo()
  252| {
  253|     // delete [] ring;
  254| }
  255|
  256| void Lex::TokenFifo::Clear()
  257| {
  258|     head = tail = 0;
  259| }
  260|
  261| void Lex::TokenFifo::Push(int token, char* pos, int len)
  262| {
  263|     const int Plus = 16;
  264|     ring[head].token = token;
  265|     ring[head].pos = pos;
  266|     ring[head].len = len;
  267|     head = (head + 1) % size;
  268|     if(head == tail){
  269|         Slot* ring2 = new (GC) Slot[size + Plus];
  270|         int i = 0;
  271|         do{
  272|             ring2[i++] = ring[tail];
  273|             tail = (tail + 1) % size;
  274|         } while(head != tail);
  275|         head = i;
  276|         tail = 0;
  277|         size += Plus;
  278|         // delete [] ring;
  279|         ring = ring2;
  280|     }
  281| }
  282|
  283| int Lex::TokenFifo::Pop(char*& pos, int& len)
  284| {
  285|     if(head == tail)
  286|         return lex->ReadToken(pos, len);
  287|
  288|     int t = ring[tail].token;
  289|     pos = ring[tail].pos;
  290|     len = ring[tail].len;
  291|     tail = (tail + 1) % size;
  292|     return t;
  293| }
  294|
  295| int Lex::TokenFifo::Peek(int offset)
  296| {
  297|     return ring[Peek2(offset)].token;
  298| }
  299|
  300| int Lex::TokenFifo::Peek(int offset, char*& pos, int& len)
  301| {
  302|     int cur = Peek2(offset);
  303|     pos = ring[cur].pos;
  304|     len = ring[cur].len;
  305|     return ring[cur].token;
  306| }
  307|
  308| int Lex::TokenFifo::Peek2(int offset)
  309| {
  310|     int i;
  311|     int cur = tail;
  312|
  313|     for(i = 0; i <= offset; ++i){
  314|         if(head == cur){
  315|             while(i++ <= offset){
  316|                char* p;
  317|                int   l;
  318|                int t = lex->ReadToken(p, l);
  319|                Push(t, p, l);
  320|         }
  321|
  322|           break;
  323|         }
  324|
  325|         cur = (cur + 1) % size;
  326|     }
  327|
  328|     return (tail + offset) % size;
  329| }
  330|
  331| /*
  332|   Lexical Analyzer
  333| */
  334|
  335| int Lex::ReadToken(char*& ptr, int& len)
  336| {
  337|     int t;
  338|
  339|     for(;;){
  340|         t = ReadLine();
  341|
  342|         if(t == Ignore)
  343|             continue;
  344|
  345|         last_token = t;
  346|
  347| #if defined(__GNUG__) || defined(_GNUG_SYNTAX)
  348|         if(t == ATTRIBUTE){
  349|             SkipAttributeToken();
  350|             continue;
  351|         }
  352|         else if(t == EXTENSION){
  353|             t = SkipExtensionToken(ptr, len);
  354|             if(t == Ignore)
  355|                continue;
  356|         else
  357|                return t;
  358|         }
  359| #endif
  360| #if defined(_MSC_VER)
  361|         if(t == ASM){
  362|             SkipAsmToken();
  363|             continue;
  364|         }
  365|         else if(t == DECLSPEC){
  366|             SkipDeclspecToken();
  367|             continue;
  368|         }
  369| #endif
  370|         if(t != '\n')
  371|           break;
  372|     }
  373|
  374|     ptr = TokenPosition();
  375|     len = TokenLen();
  376|     return t;
  377| }
  378|
//   SkipAttributeToken() skips __attribute__(...), __asm__(...), ...
  380|
  381| void Lex::SkipAttributeToken()
  382| {
  383|     char c;
  384|
  385|     do{
  386|         c = file->Get();
  387|     }while(c != '(' && c != '\0');
  388|
  389|     int i = 1;
  390|     do{
  391|         c = file->Get();
  392|         if(c == '(')
  393|         ++i;
  394|         else if(c == ')')
  395|         --i;
  396|         else if(c == '\0')
  397|           break;
  398|     } while(i > 0);
  399| }
  400|
  401| // SkipExtensionToken() skips __extension__(...).
  402|
  403| int Lex::SkipExtensionToken(char*& ptr, int& len)
  404| {
  405|     ptr = TokenPosition();
  406|     len = TokenLen();
  407|
  408|     char c;
  409|
  410|     do{
  411|         c = file->Get();
  412|     }while(is_blank(c) || c == '\n');
  413|
  414|     if(c != '('){
  415|         file->Unget();
  416|         return Ignore;        // if no (..) follows, ignore __extension__
  417|     }
  418|
  419|     int i = 1;
  420|     do{
  421|         c = file->Get();
  422|         if(c == '(')
  423|         ++i;
  424|         else if(c == ')')
  425|         --i;
  426|         else if(c == '\0')
  427|           break;
  428|     } while(i > 0);
  429|
  430|     return Identifier;  // regards it as the identifier __extension__
  431| }
  432|
  433| #if defined(_MSC_VER)
  434|
  435| #define CHECK_END_OF_INSTRUCTION(C, EOI) \
  436|         if (C == '\0') return; \
  437|         if (strchr(EOI, C)) { \
  438|             this->file->Unget(); \
  439|             return; \
  440|         }
  441|
  442| /* SkipAsmToken() skips __asm ...
  443|    You can have the following :
  444|
  445|    Just count the '{' and '}' and it should be ok
  446|    __asm { mov ax,1
  447|            mov bx,1 }
  448|
  449|    Stop when EOL found. Note that the first ';' after
  450|    an __asm instruction is an ASM comment !
  451|    int v; __asm mov ax,1 __asm mov bx,1; v=1;
  452|
  453|    Stop when '}' found
  454|    if (cond) {__asm mov ax,1 __asm mov bx,1}
  455|
  456|    and certainly more...
  457| */
  458| void Lex::SkipAsmToken()
  459| {
  460|     char c;
  461|
  462|     do{
  463|         c = file->Get();
  464|         CHECK_END_OF_INSTRUCTION(c, "");
  465|     }while(is_blank(c) || c == '\n');
  466|
  467|     if(c == '{'){
  468|         int i = 1;
  469|         do{
  470|             c = file->Get();
  471|             CHECK_END_OF_INSTRUCTION(c, "");
  472|             if(c == '{')
  473|             ++i;
  474|             else if(c == '}')
  475|             --i;
  476|         } while(i > 0);
  477|     }
  478|     else{
  479|         for(;;){
  480|             CHECK_END_OF_INSTRUCTION(c, "}\n");
  481|             c = file->Get();
  482|         }
  483|     }
  484| }
  485|
  486| //   SkipDeclspecToken() skips __declspec(...).
  487|
  488| void Lex::SkipDeclspecToken()
  489| {
  490|     char c;
  491|
  492|     do{
  493|         c = file->Get();
  494|         CHECK_END_OF_INSTRUCTION(c, "");
  495|     }while(is_blank(c));
  496|
  497|     if (c == '(') {
  498|         int i = 1;
  499|         do{
  500|             c = file->Get();
  501|             CHECK_END_OF_INSTRUCTION(c, "};");
  502|             if(c == '(')
  503|             ++i;
  504|             else if(c == ')')
  505|             --i;
  506|         }while(i > 0);
  507|     }
  508| }
  509|
  510| #undef CHECK_END_OF_INSTRUCTION
  511|
  512| #endif /* _MSC_VER */
  513|
  514| char Lex::GetNextNonWhiteChar()
  515| {
  516|     char c;
  517|
  518|     for(;;){
  519|         do{
  520|             c = file->Get();
  521|         }while(is_blank(c));
  522|
  523|         if(c != '\\')
  524|           break;
  525|
  526|         c = file->Get();
  527|         if(c != '\n' && c!= '\r') {
  528|             file->Unget();
  529|           break;
  530|         }
  531|     }
  532|
  533|     return c;
  534| }
  535|
  536| int Lex::ReadLine()
  537| {
  538|     char c;
  539|     uint top;
  540|
  541|     c = GetNextNonWhiteChar();
  542|
  543|     tokenp = top = file->GetCurPos();
  544|     if(c == '\0'){
  545|         file->Unget();
  546|         return '\0';
  547|     }
  548|     else if(c == '\n')
  549|         return '\n';
  550|     else if(c == '#' && last_token == '\n'){
  551|         if(ReadLineDirective())
  552|             return '\n';
  553|         else{
  554|             file->Rewind(top + 1);
  555|             token_len = 1;
  556|             return SingleCharOp(c);
  557|         }
  558|     }
  559|     else if(c == '\'' || c == '"'){
  560|         if(c == '\''){
  561|             if(ReadCharConst(top))
  562|                return token(CharConst);
  563|         }
  564|         else{
  565|             if(ReadStrConst(top))
  566|                return token(StringL);
  567|         }
  568|
  569|         file->Rewind(top + 1);
  570|         token_len = 1;
  571|         return SingleCharOp(c);
  572|     }
  573|     else if(is_digit(c))
  574|         return ReadNumber(c, top);
  575|     else if(c == '.'){
  576|         c = file->Get();
  577|         if(is_digit(c))
  578|             return ReadFloat(top);
  579|         else{
  580|             file->Unget();
  581|             return ReadSeparator('.', top);
  582|         }
  583|     }
  584|     else if(is_letter(c)) {
  585|         if (c == 'L') {
  586|             // May be a L"const" type string
  587|             char next = file->Get();
  588|             if (next == '"') {
  589|                if (ReadStrConst(top))
  590|                    return token(StringL);
  591|         }
  592|             file->Unget();
  593|         }
  594|         return ReadIdentifier(top);
  595|     } else
  596|         return ReadSeparator(c, top);
  597| }
  598|
  599| bool Lex::ReadCharConst(uint top)
  600| {
  601|     char c;
  602|
  603|     for(;;){
  604|         c = file->Get();
  605|         if(c == '\\'){
  606|             c = file->Get();
  607|             if(c == '\0')
  608|                return FALSE;
  609|         }
  610|         else if(c == '\''){
  611|             token_len = int(file->GetCurPos() - top + 1);
  612|             return TRUE;
  613|         }
  614|         else if(c == '\n' || c == '\0')
  615|             return FALSE;
  616|     }
  617| }
  618|
/*
  If text is a sequence of string constants like:
        "string1" "string2"  L"string3"
  then the string constants are dealt with as a single constant.
*/
  624| bool Lex::ReadStrConst(uint top)
  625| {
  626|     char c;
  627|
  628|     // Skip the L if there is one
  629|     if (*file->Read(top) == 'L')
  630|         file->Get();
  631|
  632|     for(;;){
  633|         c = file->Get();
  634|         if(c == '\\'){
  635|             c = file->Get();
  636|             if(c == '\0')
  637|                return FALSE;
  638|         }
  639|         else if(c == '"'){
  640|             uint pos = file->GetCurPos() + 1;
  641|             int nline = 0;
  642|         do{
  643|                c = file->Get();
  644|                if(c == '\n')
  645|                ++nline;
  646|             } while(is_blank(c) || c == '\n');
  647|
  648|             if(c == '"')
  649|                /* line_number += nline; */ ;
  650|          else{
  651|                token_len = int(pos - top);
  652|                file->Rewind(pos);
  653|                return TRUE;
  654|         }
  655|         }
  656|         else if(c == '\n' || c == '\0')
  657|             return FALSE;
  658|     }
  659| }
  660|
  661| int Lex::ReadNumber(char c, uint top)
  662| {
  663|     char c2 = file->Get();
  664|
  665|     if(c == '0' && is_xletter(c2)){
  666|         do{
  667|             c = file->Get();
  668|         } while(is_hexdigit(c));
  669|         while(is_int_suffix(c))
  670|             c = file->Get();
  671|
  672|         file->Unget();
  673|         token_len = int(file->GetCurPos() - top + 1);
  674|         return token(Constant);
  675|     }
  676|
  677|     while(is_digit(c2))
  678|         c2 = file->Get();
  679|
  680|     if(is_int_suffix(c2))
  681|         do{
  682|             c2 = file->Get();
  683|         }while(is_int_suffix(c2));
  684|     else if(c2 == '.')
  685|         return ReadFloat(top);
  686|     else if(is_eletter(c2)){
  687|         file->Unget();
  688|         return ReadFloat(top);
  689|     }
  690|
  691|     file->Unget();
  692|     token_len = int(file->GetCurPos() - top + 1);
  693|     return token(Constant);
  694| }
  695|
  696| int Lex::ReadFloat(uint top)
  697| {
  698|     char c;
  699|
  700|     do{
  701|         c = file->Get();
  702|     }while(is_digit(c));
  703|     if(is_float_suffix(c))
  704|         do{
  705|             c = file->Get();
  706|         }while(is_float_suffix(c));
  707|     else if(is_eletter(c)){
  708|         uint p = file->GetCurPos();
  709|         c = file->Get();
  710|         if(c == '+' || c == '-'){
  711|              c = file->Get();
  712|              if(!is_digit(c)){
  713|                file->Rewind(p);
  714|                token_len = int(p - top);
  715|                return token(Constant);
  716|         }
  717|         }
  718|         else if(!is_digit(c)){
  719|             file->Rewind(p);
  720|             token_len = int(p - top);
  721|             return token(Constant);
  722|         }
  723|
  724|         do{
  725|             c = file->Get();
  726|         }while(is_digit(c));
  727|
  728|         while(is_float_suffix(c))
  729|             c = file->Get();
  730|     }
  731|
  732|     file->Unget();
  733|     token_len = int(file->GetCurPos() - top + 1);
  734|     return token(Constant);
  735| }
  736|
  737| // ReadLineDirective() simply ignores a line beginning with '#'
  738|
  739| bool Lex::ReadLineDirective()
  740| {
  741|     char c;
  742|
  743|     do{
  744|         c = file->Get();
  745|     }while(c != '\n' && c != '\0');
  746|     return TRUE;
  747| }
  748|
  749| int Lex::ReadIdentifier(uint top)
  750| {
  751|     char c;
  752|
  753|     do{
  754|         c = file->Get();
  755|     }while(is_letter(c) || is_digit(c));
  756|
  757|     uint len = file->GetCurPos() - top;
  758|     token_len = int(len);
  759|     file->Unget();
  760|
  761|     return Screening((char*)file->Read(top), int(len));
  762| }
  763|
  764| /*
  765|   This table is a list of reserved key words.
  766|   Note: alphabetical order!
  767| */
  768| static struct rw_table {
  769|     char*       name;
  770|     long        value;
  771| } table[] = {
  772| #if defined(__GNUG__) || defined(_GNUG_SYNTAX)
  773|     { "__alignof__",    token(SIZEOF
  774|     { "__asm__",        token(ATTRI
  775|     { "__attribute__",  token(ATTRIBUTE)
  776|         { "__complex__",token(Ignore) },
  777|     { "__const",        token(C
  778|     { "__extension__",  token(EXTENSION)
  779|     { "__imag__",       token(Ign
  780|     { "__inline__",     token(INLIN
  781|     { "__real__",       token(Ign
  782|     { "__restrict",     token(Ignor
  783|     { "__restrict__",   token(Ignore)
  784|     { "__signed",       token(SIG
  785|     { "__signed__",     token(SIGNE
  786|     { "__typeof",       token(TYP
  787|     { "__typeof__",     token(TYPEO
  788| #endif
  789|     { "asm",           token(AT
  790|     { "auto",          toke
  791| #if !defined(_MSC_VER) || (_MSC_VER >= 1100)
  792|     { "bool",          token(B
  793| #endif
  794|     { "break",         token(
  795|     { "case",          toke
  796|     { "catch",         token(
  797|     { "char",          toke
  798|     { "class",         token(
  799|     { "const",         token(
  800|     { "continue",       token(CONTI
  801|     { "default",        token(DEF
  802|     { "delete",        token(DE
  803|     { "do",
  804|     { "double",        token(DO
  805|     { "else",          toke
  806|     { "enum",          toke
  807|     { "extern",        token(EX
  808|     { "float",         token(
  809|     { "for",           to
  810|     { "friend",        token(FR
  811|     { "goto",          toke
  812|     { "if",
  813|     { "inline",        token(IN
  814|     { "int",           to
  815|     { "long",          toke
  816|     { "metaclass",      token(METACLASS) },    // OpenC++
  817|     { "mutable",        token(MUT
  818|     { "namespace",      token(NAMESPA
  819|     { "new",           to
  820|     { "operator",       token(OPERA
  821|     { "private",        token(PRI
  822|     { "protected",      token(PROTECT
  823|     { "public",        token(PU
  824|     { "register",       token(REGIS
  825|     { "return",        token(RE
  826|     { "short",         token(
  827|     { "signed",        token(SI
  828|     { "sizeof",        token(SI
  829|     { "static",        token(ST
  830|     { "struct",        token(ST
  831|     { "switch",        token(SW
  832|     { "template",       token(TEMPL
  833|     { "this",          toke
  834|     { "throw",         token(
  835|     { "try",           to
  836|     { "typedef",        token(TYP
  837|     { "typeid",        token(TY
  838|     { "typename",       token(CLASS) },        // it's not identical to class, but...
  839|     { "union",         token(
  840|     { "unsigned",       token(UNSIG
  841|     { "using",         token(
  842|     { "virtual",        token(VIR
  843|     { "void",          toke
  844|     { "volatile",       token(VOLAT
  845|     { "while",         token(
  846|     /* NULL slot */
  847| };
  848|
  849| static void InitializeOtherKeywords()
  850| {
  851|     static BOOL done = FALSE;
  852|
  853|     if(done)
  854|         return;
  855|     else
  856|         done = TRUE;
  857|
  858|     if(regularCpp)
  859|         for(unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); ++i)
  860|             if(table[i].value == METACLASS){
  861|                table[i].value = Identifier;
  862|         break;
  863|         }
  864|
  865| #if defined(_MSC_VER)
  866|     assert(Lex::RecordKeyword("cdecl", Ignore));
  867|     assert(Lex::RecordKeyword("_cdecl", Ignore));
  868|     assert(Lex::RecordKeyword("__cdecl", Ignore));
  869|
  870|     assert(Lex::RecordKeyword("_fastcall", Ignore));
  871|     assert(Lex::RecordKeyword("__fastcall", Ignore));
  872|
  873|     assert(Lex::RecordKeyword("_based", Ignore));
  874|     assert(Lex::RecordKeyword("__based", Ignore));
  875|
  876|     assert(Lex::RecordKeyword("_asm", ASM));
  877|     assert(Lex::RecordKeyword("__asm", ASM));
  878|
  879|     assert(Lex::RecordKeyword("_inline", INLINE));
  880|     assert(Lex::RecordKeyword("__inline", INLINE));
  881|
  882|     assert(Lex::RecordKeyword("_stdcall", Ignore));
  883|     assert(Lex::RecordKeyword("__stdcall", Ignore));
  884|
  885|     assert(Lex::RecordKeyword("__declspec", DECLSPEC));
  886|
  887|     assert(Lex::RecordKeyword("__int8",  CHAR));
  888|     assert(Lex::RecordKeyword("__int16", SHORT));
  889|     assert(Lex::RecordKeyword("__int32", INT));
  890|     assert(Lex::RecordKeyword("__int64",  INT64));
  891| #endif
  892| }
  893|
  894| int Lex::Screening(char *identifier, int len)
  895| {
  896|     struct rw_table     *low, *high, *mid;
  897|     int               c, token;
  898|
  899|     low = table;
  900|     high = &table[sizeof(table) / sizeof(table[0]) - 1];
  901|     while(low <= high){
  902|         mid = low + (high - low) / 2;
  903|         if((c = strncmp(mid->name, identifier, len)) == 0)
  904|             if(mid->name[len] == '\0')
  905|                return mid->value;
  906|         else
  907|                high = mid - 1;
  908|         else if(c < 0)
  909|             low = mid + 1;
  910|         else
  911|             high = mid - 1;
  912|     }
  913|
  914|     if(user_keywords == nil)
  915|         user_keywords = new HashTable;
  916|
  917|     if(user_keywords->Lookup(identifier, len, (HashValue*)&token))
  918|         return token;
  919|
  920|     return token(Identifier);
  921| }
  922|
  923| int Lex::ReadSeparator(char c, uint top)
  924| {
  925|     char c1 = file->Get();
  926|
  927|     token_len = 2;
  928|     if(c1 == '='){
  929|         switch(c){
  930|         case '*' :
  931|         case '/' :
  932|         case '%' :
  933|         case '+' :
  934|         case '-' :
  935|         case '&' :
  936|         case '^' :
  937|         case '|' :
  938|             return token(AssignOp);
  939|         case '=' :
  940|         case '!' :
  941|             return token(EqualOp);
  942|         case '<' :
  943|         case '>' :
  944|             return token(RelOp);
  945|         default :
  946|             file->Unget();
  947|             token_len = 1;
  948|             return SingleCharOp(c);
  949|         }
  950|     }
  951|     else if(c == c1){
  952|         switch(c){
  953|         case '<' :
  954|         case '>' :
  955|             if(file->Get() != '='){
  956|                file->Unget();
  957|                return token(ShiftOp);
  958|         }
  959|          else{
  960|                token_len = 3;
  961|                return token(AssignOp);
  962|         }
  963|         case '|' :
  964|             return token(LogOrOp);
  965|         case '&' :
  966|             return token(LogAndOp);
  967|         case '+' :
  968|         case '-' :
  969|             return token(IncOp);
  970|         case ':' :
  971|             return token(Scope);
  972|         case '.' :
  973|             if(file->Get() == '.'){
  974|                token_len = 3;
  975|                return token(Ellipsis);
  976|         }
  977|         else
  978|                file->Unget();
  979|         case '/' :
  980|             return ReadComment(c1, top);
  981|         default :
  982|             file->Unget();
  983|             token_len = 1;
  984|             return SingleCharOp(c);
  985|         }
  986|     }
  987|     else if(c == '.' && c1 == '*')
  988|         return token(PmOp);
  989|     else if(c == '-' && c1 == '>')
  990|         if(file->Get() == '*'){
  991|             token_len = 3;
  992|             return token(PmOp);
  993|         }
  994|         else{
  995|             file->Unget();
  996|             return token(ArrowOp);
  997|         }
  998|     else if(c == '/' && c1 == '*')
  999|         return ReadComment(c1, top);
1000|     else{
1001|         file->Unget();
1002|         token_len = 1;
1003|         return SingleCharOp(c);
1004|     }
1005|
1006|     std::cerr << "*** An invalid character has been found! ("
1007|          << (int)c << ',' << (int)c1 << ")\n";
1008|     return token(BadToken);
1009| }
1010|
1011| int Lex::SingleCharOp(unsigned char c)
1012| {
1013|                       /* !"#$%&'()*+,-./0123456789:;<=>? */
1014|     static char valid[] = "x   xx xxxxxxxx          xxxxxx";
1015|
1016|     if('!' <= c && c <= '?' && valid[c - '!'] == 'x')
1017|         return c;
1018|     else if(c == '[' || c == ']' || c == '^')
1019|         return c;
1020|     else if('{' <= c && c <= '~')
1021|         return c;
1022|     else if(c == '#') {
1023|         // Skip to end of line
1024|         do {
1025|             c = file->Get();
1026|         }while(c != '\n' && c != '\0');
1027|         return Ignore;
1028|     } else {
1029|         std::cerr << "*** An invalid character has been found! ("<<(char)c<<")"<< std::endl;
1030|         return token(BadToken);
1031|     }
1032| }
1033|
1034| int Lex::ReadComment(char c, uint top) {
1035|     uint len = 0;
1036|     if (c == '*')       // a nested C-style comment is proh
1037|         do {
1038|             c = file->Get();
1039|             if (c == '*') {
1040|                c = file->Get();
1041|                if (c == '/') {
1042|                len = 1;
1043|                break;
1044|         }
1045|         else
1046|                   file->Unget();
1047|         }
1048|         }while(c != '\0');
1049|     else /* if (c == '/') */
1050|         do {
1051|             c = file->Get();
1052|         }while(c != '\n' && c != '\0');
1053|
1054|     len += file->GetCurPos() - top;
1055|     token_len = int(len);
1056|     Leaf* node = new Leaf((char*)file->Read(top), int(len));
1057|     comments = Ptree::Snoc(comments, node);
1058|     return Ignore;
1059| }
1060|
1061| Ptree* Lex::GetComments() {
1062|     Ptree* c = comments;
1063|     comments = nil;
1064|     return c;
1065| }
1066|
1067| Ptree* Lex::GetComments2() {
1068|     return comments;
1069| }
1070|
1071| #ifdef TEST
1072| #include <stdio.h>
1073|
1074| main()
1075| {
1076|     int   i = 0;
1077|     Token token;
1078|
1079|     Lex lex(new ProgramFromStdin);
1080|     for(;;){
1081| //      int t = lex.GetToken(t
1082|         int t = lex.LookAhead(i++, token);
1083|         if(t == 0)
1084|           break;
1085|         else if(t < 128)
1086|             printf("%c (%x): ", t, t);
1087|         else
1088|             printf("%-10.10s (%x): ", (char*)t, t);
1089|
1090|         putchar('"');
1091|         while(token.len-- > 0)
1092|             putchar(*token.ptr++);
1093|
1094|         puts("\"");
1095|     };
1096| }
1097| #endif
1098|
/*

line directive:
^"#"{blank}*{digit}+({blank}+.*)?\n

pragma directive:
^"#"{blank}*"pragma".*\n

Constant        {digit}+{int_suffix}*
                "0"{xletter}{hexdigit}+{int_suffix}*
                {digit}*\.{digit}+{float_suffix}*
                {digit}+\.{float_suffix}*
                {digit}*\.{digit}+"e"("+"|"-")*{digit}+{float_suffix}*
                {digit}+\."e"("+"|"-")*{digit}+{float_suffix}*
                {digit}+"e"("+"|"-")*{digit}+{float_suffix}*

CharConst       \'([^'\n]|\\[^\n])*\'

StringL         \"([^"\n]|\\[^\n])*\"

Identifier      {letter}+({letter}|{digit})*

AssignOp        *= /= %= += -= &= ^= |= <<= >>=

EqualOp         == !=

RelOp           <= >=

ShiftOp         << >>

LogOrOp         ||

LogAndOp        &&

IncOp           ++ --

Scope           ::

Ellipsis        ...

PmOp            .* ->*

ArrowOp         ->

others          !%^&*()-+={}|~[];:<>?,./

BadToken        any other character

*/
1146|
1147| */