SQLite--Tokenize(词法分析器)

SQLite的词法分析器的代码逻辑其实很简单，但是其中也不乏SQLite对于执行效率的优化。甚至可以说SQLite在追求高效的每一个细节点，都无所不用其极。文章从源码的角度分别看具体的实现逻辑与优化的细节

关于前端的大致结构

Tokenize是SQLite的最前端，它的职责就是，将用户输入的一个SQL字符串，分割为一个一个独立的词。并且以特征ID的形式一个一个输入给Parser，而不是一个一个单词。因为parser只认识这些特征ID。

先说一下Parser

Parser是SQLite开发的一个语法分析器。在$root/tool/目录下有一个lemon.c的工具。SQLite是通过cmake来配置源码的编译，在autoconf配置下产生的Makefile中有几个关于这个的指令:

# 编译lemon.c，生成lemon可执行文件
#
lemon$(BEXE):	$(TOP)/tool/lemon.c $(TOP)/tool/lempar.c
	$(BCC) -o $@ $(TOP)/tool/lemon.c
	cp $(TOP)/tool/lempar.c .

# parse.h 和 parse.c文件都依赖于lemon可执行文件
# 通过command可以看到，lemon依赖parse.y这个文件生成了parse.h和parse.c
#
parse.h:	parse.c

parse.c:	$(TOP)/src/parse.y lemon$(BEXE) $(TOP)/tool/addopcodes.tcl
	cp $(TOP)/src/parse.y .
	rm -f parse.h
	./lemon$(BEXE) $(OPT_FEATURE_FLAGS) $(OPTS) parse.y
	mv parse.h parse.h.temp
	$(TCLSH_CMD) $(TOP)/tool/addopcodes.tcl parse.h.temp >parse.h

# fts5也是使用的lemon程序，通过fts5parse.y的定义来生成解析器
fts5parse.c:	$(TOP)/ext/fts5/fts5parse.y lemon
	cp $(TOP)/ext/fts5/fts5parse.y .
	rm -f fts5parse.h
	./lemon$(BEXE) $(OPTS) fts5parse.y

Parser的核心代码都是使用lemon这个工具生成的，通过不断地输入在parse.h中定义的特征ID来生成一个语法树。

那特征ID哪来呢？就是上面已经提到的Tokenize。

在SQLite API中的
int sqlite3_prepare();
int sqlite3_prepare_v2();
int sqlite3_prepare_v3();

三个SQL预处理方法都是通过先调用tokenize，然后调用parser，通过vm的编译生成字节码

大致的函数调用时序:

词法分析器源码

tokenize.c的源码结构非常简单。只有两个核心函数 int sqlite3GetToken(const unsigned char *z, int *tokenType) 与 int sqlite3RunParser(Parse *pParse, const char *zSql, char **pzErrMsg)

prepare函数会直接调用sqlite3RunParser函数，这个函数虽然很长，但是把核心代码提取出来后，就一目了然。

int sqlite3RunParser(Parse *pParse, const char *zSql, char **pzErrMsg){
  int nErr = 0;                   /* Number of errors encountered */
  void *pEngine;                  /* The LEMON-generated LALR(1) parser */
  int n = 0;                      /* Length of the next token token */
  int tokenType;                  /* type of the next token */
  int lastTokenParsed = -1;       /* type of the previous token */
  sqlite3 *db = pParse->db;       /* The database connection */
  int mxSqlLen;                   /* Max length of an SQL string */
  pParse->zTail = zSql;

  pEngine = sqlite3ParserAlloc(sqlite3Malloc, pParse);

  while( 1 ){
    n = sqlite3GetToken((u8*)zSql, &tokenType);
    mxSqlLen -= n;
    if( mxSqlLen<0 ){
      pParse->rc = SQLITE_TOOBIG;
      break;
    }
    if( tokenType>=TK_SPACE ){
      assert( tokenType==TK_SPACE || tokenType==TK_ILLEGAL );
      if( db->u1.isInterrupted ){
        pParse->rc = SQLITE_INTERRUPT;
        break;
      }
      if( tokenType==TK_SPACE ){
        zSql += n;
        continue;
      }
      if( zSql[0]==0 ){
        /* Upon reaching the end of input, call the parser two more times
        ** with tokens TK_SEMI and 0, in that order. */
        if( lastTokenParsed==TK_SEMI ){
          tokenType = 0;
        }else if( lastTokenParsed==0 ){
          break;
        }else{
          tokenType = TK_SEMI;
        }
        n = 0;
      }else{
        sqlite3ErrorMsg(pParse, "unrecognized token: \"%.*s\"", n, zSql);
        break;
      }
    }
    pParse->sLastToken.z = zSql;
    pParse->sLastToken.n = n;
    sqlite3Parser(pEngine, tokenType, pParse->sLastToken);
    lastTokenParsed = tokenType;
    zSql += n;
    if( pParse->rc!=SQLITE_OK || db->mallocFailed ) break;
  }
  // mem free code...
  return nErr;
}

这样似乎看起来还是很多，再精简一下。

while( 1 ){
    n = sqlite3GetToken((u8*)zSql, &tokenType);
    sqlite3Parser(pEngine, tokenType, pParse->sLastToken);    
}

只有两个步骤，取词，调用parser，然后不断地循环，一直到全部读完为止。这也印证了上面的话。

从SQL语句中取词无非也就是根据分割符，不断的裁剪分割罢了。但是开头提到，SQLite为了追求效率，做了很多优化。

后面详细说一下优化的细节。

词法分析器效率的优化

sqlite3RunParser函数主要是流程化的代码，主要优化都在sqlite3GetToken函数内，看下源码。

/*
** Return the length (in bytes) of the token that begins at z[0]. 
** Store the token type in *tokenType before returning.
*/
int sqlite3GetToken(const unsigned char *z, int *tokenType){
  int i, c;
  switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
                          ** of the token. See the comment on the CC_ defines
                          ** above. */
    case CC_SPACE: {
      testcase( z[0]==' ' );
      testcase( z[0]=='\t' );
      testcase( z[0]=='\n' );
      testcase( z[0]=='\f' );
      testcase( z[0]=='\r' );
      for(i=1; sqlite3Isspace(z[i]); i++){}
      *tokenType = TK_SPACE;
      return i;
    }
    case CC_VARNUM: {
      *tokenType = TK_VARIABLE;
      for(i=1; sqlite3Isdigit(z[i]); i++){}
      return i;
    }

    // .... more case

    case CC_KYWD: {
      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
      if( IdChar(z[i]) ){
        /* This token started out using characters that can appear in keywords,
        ** but z[i] is a character not allowed within keywords, so this must
        ** be an identifier instead */
        i++;
        break;
      }
      *tokenType = TK_ID;
      return keywordCode((char*)z, i, tokenType);
    }
    case CC_X: {
#ifndef SQLITE_OMIT_BLOB_LITERAL
      testcase( z[0]=='x' ); testcase( z[0]=='X' );
      if( z[1]=='\'' ){
        *tokenType = TK_BLOB;
        for(i=2; sqlite3Isxdigit(z[i]); i++){}
        if( z[i]!='\'' || i%2 ){
          *tokenType = TK_ILLEGAL;
          while( z[i] && z[i]!='\'' ){ i++; }
        }
        if( z[i] ) i++;
        return i;
      }
#endif
      /* If it is not a BLOB literal, then it must be an ID, since no
      ** SQL keywords start with the letter 'x'.  Fall through */
    }
    case CC_ID: {
      i = 1;
      break;
    }
    case CC_NUL: {
      *tokenType = TK_ILLEGAL;
      return 0;
    }
    default: {
      *tokenType = TK_ILLEGAL;
      return 1;
    }
  }
  while( IdChar(z[i]) ){ i++; }
  *tokenType = TK_ID;
  return i;
}

从代码可以看到，sqlite3GetToken输入一个SQL字符串，然后根据这个串的第一个字符判断是什么类型(CC_SPACE,CC_NUL,…)等等，用一个巨大的switch-case来处理不同的词，因为这个函数需要依次返回各种词的类型(tokenType)，也就是上文提到的在Parser.h中定义的类型。

但是…aiClass[*z]这个是干什么用的？为什么不直接使用switch(*z)？

这里就不得不讲到 Switch-Case 的编译实现了。

switch-case在使用的时候编译器会根据case中的constant值来决定使用哪种方式做检索。具体了解的话可以看这篇文章关于C/C++ switch语句你也许不知道的一些事

编译器使用二级表的方式，将case下的代码块与一个维表索引形成映射关系，以及将case的值与这个一维表再形成索引关系。
编译器使用一级表的方式，将case下的代码块直接和case的值形成映射关系。
编译器使用二分查找的方式，通过O(logn)的时间复杂度来查找。

根据case下不同的值，或者值不同的大小，编译器会选择不同的实现方式。前两种是O(1)的时间复杂度，第三种的时间复杂度是O(logn)。SQLite为了保证它能够在能高的效率下执行，也做了一次类似的”二级映射”。

只考虑ASCII码的情况:

ASCII码可以使用7bit或者8bit来表示128或者256种字符，后128个是扩展ASCII。SQLite对这256个ASCII做了一个一维映射。构建了一个256个元素的一维表，而这个表就是上面看到的aiClass表。表内的值，就代表了这个ASCII码对应的符号的类型。SQLite将所有符号的类型归纳为29种，可以在tokenize.c的前半部分看到所有的29个宏定义。之所以这样做，就是为了能够在switch-case执行的时候让编译器可以在O(1)的时间复杂度内完成索引与比较。

下面列出了ASCII码对应的 aiClass 表以及对应的类型。

/* Character classes for tokenizing
**
** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
** using a lookup table, whereas a switch() directly on c uses a binary search.
** The lookup table is much faster.  To maximize speed, and to ensure that
** a lookup table is used, all of the classes need to be small integers and
** all of them need to be used within the switch.
*/
#define CC_X          0    /* The letter 'x', or start of BLOB literal */
#define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
#define CC_ID         2    /* unicode characters usable in IDs */
#define CC_DIGIT      3    /* Digits */
#define CC_DOLLAR     4    /* '$' */
#define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
#define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
#define CC_SPACE      7    /* Space characters */
#define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
#define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
#define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
#define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
#define CC_LT        12    /* '<'.  Part of < or <= or <> */
#define CC_GT        13    /* '>'.  Part of > or >= */
#define CC_EQ        14    /* '='.  Part of = or == */
#define CC_BANG      15    /* '!'.  Part of != */
#define CC_SLASH     16    /* '/'.  / or c-style comment */
#define CC_LP        17    /* '(' */
#define CC_RP        18    /* ')' */
#define CC_SEMI      19    /* ';' */
#define CC_PLUS      20    /* '+' */
#define CC_STAR      21    /* '*' */
#define CC_PERCENT   22    /* '%' */
#define CC_COMMA     23    /* ',' */
#define CC_AND       24    /* '&' */
#define CC_TILDA     25    /* '~' */
#define CC_DOT       26    /* '.' */
#define CC_ILLEGAL   27    /* Illegal character */
#define CC_NUL       28    /* 0x00 */

static const unsigned char aiClass[] = {
#ifdef SQLITE_ASCII
/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
/* 0x */   28, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
/* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
/* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
/* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
/* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
/* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, //后128个字符都是 2，也就是 CC_ID。
/* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
#endif
};

在case CC_KYWD的情况下，获取到一个完整的单词之后，通过调用keywordCode函数，还要再次判断，这个单词是否是SQLite的关键字，并且返回具体的关键字的特征ID。关于keywordCode函数的原理和优化，可以看后续的SQLite–KeywordHash