SQLite——Page Cache之事务处理(2)

写在前面:个人认为pager层是SQLite实现最为核心的模块,它具有四大功能:I/O,页面缓存,并发控制和日志恢复。而这些功能不仅是上层Btree的基础,而且对系统的性能和健壮性有着至关重要的影响。其中并发控制和日志恢复是事务处理实现的基础。SQLite并发控制的机制非常简单——封锁机制;另外,它的查询优化机制也非常简单——基于索引。这一切使得整个SQLite的实现变得简单,SQLite变得很小,运行速度也非常快,所以,特别适合嵌入式设备。好了,接下来讨论事务的剩余部分。

6、修改位于用户进程空间的页面(Changing Database Pages In User Space)

页面的原始数据写入日志之后,就可以修改页面了——位于用户进程空间。每个数据库连接都有自己私有的空间,所以页面的变化只对该连接可见,而其它连接看到的仍然是磁盘缓存中的数据。从这里可以明白一件事:一个进程在修改页面数据的同时,其它进程可以继续进行读操作。图中的红色表示修改的页面。 document/2015-09-15/55f7c436736ac

7、日志文件刷入磁盘(Flushing The Rollback Journal File To Mass Storage)

接下来把日志文件的内容刷入磁盘,这对于数据库从意外中恢复来说是至关重要的一步。而且这通常也是一个耗时的操作,因为磁盘I/O速度很慢。 这个步骤不只把日志文件刷入磁盘那么简单,它的实现实际上分成两步:首先把日志文件的内容刷入磁盘(即页面数据);然后把日志文件中页面的数目写入日志文件头,再把header刷入磁盘(这一过程在代码中清晰可见)。 document/2015-09-15/55f7c44737552 代码如下:

  1. /*
  2. ** Sync the journal file when pPager->needSync is set, then clear the needSync flag on the pager and on every page. Returns SQLITE_OK, or the first non-zero code returned by a file operation.
  3. */
  4. static int syncJournal(Pager *pPager){
  5. PgHdr *pPg;
  6. int rc = SQLITE_OK;
  7. /* Sync the journal before modifying the main database
  8. ** (assuming there is a journal and it needs to be synced.)
  9. */
  10. if( pPager->needSync ){
  11. if( !pPager->tempFile ){
  12. assert( pPager->journalOpen );
  13. /* assert( !pPager->noSync ); // noSync might be set if synchronous
  14. ** was turned off after the transaction was started. Ticket #615 */
  15. #ifndef NDEBUG
  16. {
  17. /* Make sure the pPager->nRec counter we are keeping agrees
  18. ** with the nRec computed from the size of the journal file.
  19. */
  20. i64 jSz;
  21. rc = sqlite3OsFileSize(pPager->jfd, &jSz);
  22. if( rc!=0 ) return rc;
  23. assert( pPager->journalOff==jSz );
  24. }
  25. #endif
  26. {
  27. /* Write the nRec value into the journal file header. If in
  28. ** full-synchronous mode, sync the journal first. This ensures that
  29. ** all data has really hit the disk before nRec is updated to mark
  30. ** it as a candidate for rollback.
  31. */
  32. if( pPager->fullSync ){
  33. TRACE2("SYNC journal of %d\n", PAGERID(pPager));
  34. // First make sure the page data already written to the journal has really reached the disk
  35. rc = sqlite3OsSync(pPager->jfd, 0);
  36. if( rc!=0 ) return rc;
  37. }
  38. rc = sqlite3OsSeek(pPager->jfd,
  39. pPager->journalHdr + sizeof(aJournalMagic));
  40. if( rc ) return rc;
  41. // Write the record count (nRec) into the journal file header
  42. rc = write32bits(pPager->jfd, pPager->nRec);
  43. if( rc ) return rc;
  44. rc = sqlite3OsSeek(pPager->jfd, pPager->journalOff);
  45. if( rc ) return rc;
  46. }
  47. TRACE2("SYNC journal of %d\n", PAGERID(pPager));
  48. rc = sqlite3OsSync(pPager->jfd, pPager->full_fsync);
  49. if( rc!=0 ) return rc;
  50. pPager->journalStarted = 1;
  51. }
  52. pPager->needSync = 0;
  53. /* Erase the needSync flag from every page.
  54. */
  55. // Clear the needSync flag on every page in the pager's list of all pages
  56. for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  57. pPg->needSync = 0;
  58. }
  59. pPager->pFirstSynced = pPager->pFirst;
  60. }
  61. #ifndef NDEBUG
  62. /* If the Pager.needSync flag is clear then the PgHdr.needSync
  63. ** flag must also be clear for all pages. Verify that this
  64. ** invariant is true.
  65. */
  66. else{
  67. for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  68. assert( pPg->needSync==0 );
  69. }
  70. assert( pPager->pFirstSynced==pPager->pFirst );
  71. }
  72. #endif
  73. return rc;
  74. }

8、获取排斥锁(Obtaining An Exclusive Lock)

在对数据库文件进行修改之前(注:这里不是内存中的页面),我们必须得到数据库文件的排斥锁(Exclusive Lock)。得到排斥锁的过程可分为两步:首先得到Pending lock;然后Pending lock升级到exclusive lock。 Pending lock允许其它已经存在的Shared lock继续读数据库文件,但是不允许产生新的shared lock,这样做目的是为了防止写操作发生饿死情况。一旦所有的shared lock完成操作,则pending lock升级到exclusive lock。 document/2015-09-15/55f7c46bdecfc

9、修改的页面写入文件(Writing Changes To The Database File)

一旦得到exclusive lock,其它的进程就不能进行读操作,此时就可以把修改的页面写回数据库文件,但是通常OS都把结果暂时保存到磁盘缓存中,直到某个时刻才会真正把结果写入磁盘。 document/2015-09-15/55f7c489ad126

以上两步的实现代码:

  1. // Write every page on the dirty list pList to the database file.
  2. // The EXCLUSIVE lock is acquired here first, then each page is handed to the OS write call; returns SQLITE_OK or the first error code.
  3. static int pager_write_pagelist(PgHdr *pList){
  4. Pager *pPager;
  5. int rc;
  6. if( pList==0 ) return SQLITE_OK;
  7. pPager = pList->pPager;
  8. /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
  9. ** database file. If there is already an EXCLUSIVE lock, the following
  10. ** calls to sqlite3OsLock() are no-ops.
  11. **
  12. ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
  13. ** through an intermediate state PENDING. A PENDING lock prevents new
  14. ** readers from attaching to the database but is unsufficient for us to
  15. ** write. The idea of a PENDING lock is to prevent new readers from
  16. ** coming in while we wait for existing readers to clear.
  17. **
  18. ** While the pager is in the RESERVED state, the original database file
  19. ** is unchanged and we can rollback without having to playback the
  20. ** journal into the original database file. Once we transition to
  21. ** EXCLUSIVE, it means the database file has been changed and any rollback
  22. ** will require a journal playback.
  23. */
  24. // Obtain the EXCLUSIVE_LOCK before touching the database file
  25. rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  26. if( rc!=SQLITE_OK ){
  27. return rc;
  28. }
  29. while( pList ){
  30. assert( pList->dirty );
  31. rc = sqlite3OsSeek(pPager->fd, (pList->pgno-1)*(i64)pPager->pageSize);
  32. if( rc ) return rc;
  33. /* If there are dirty pages in the page cache with page numbers greater
  34. ** than Pager.dbSize, this means sqlite3pager_truncate() was called to
  35. ** make the file smaller (presumably by auto-vacuum code). Do not write
  36. ** any such pages to the file.
  37. */
  38. if( pList->pgno<=pPager->dbSize ){
  39. char *pData = CODEC2(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
  40. TRACE3("STORE %d page %d\n", PAGERID(pPager), pList->pgno);
  41. // Write the page to the database file
  42. rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize);
  43. TEST_INCR(pPager->nWrite);
  44. }
  45. #ifndef NDEBUG
  46. else{
  47. TRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
  48. }
  49. #endif
  50. if( rc ) return rc;
  51. // Clear the dirty flag: the page has been written, or deliberately skipped because it lies beyond dbSize
  52. pList->dirty = 0;
  53. #ifdef SQLITE_CHECK_PAGES
  54. pList->pageHash = pager_pagehash(pList);
  55. #endif
  56. // Advance to the next dirty page
  57. pList = pList->pDirty;
  58. }
  59. return SQLITE_OK;
  60. }

10、修改结果刷入存储设备(Flushing Changes To Mass Storage)

为了保证修改结果真正写入磁盘,这一步必不可少。对于数据库的完整性,这一步也是关键的一步。由于要进行实际的I/O操作,所以和第7步一样,将花费较多的时间。

document/2015-09-15/55f7c4b293ca6 最后来看看这几步是如何实现的:

其实以上几步是在函数sqlite3BtreeSync()——btree.c中调用的(而关于该函数的调用后面再讲)。

代码如下:

  1. // Sync the database file that backs this btree; does nothing (returns SQLITE_OK) unless p->inTrans==TRANS_WRITE.
  2. // Per the author's note: once this function returns, all that remains is to commit the write transaction and delete the journal file.
  3. int sqlite3BtreeSync(Btree *p, const char *zMaster){
  4. int rc = SQLITE_OK;
  5. if( p->inTrans==TRANS_WRITE ){
  6. BtShared *pBt = p->pBt;
  7. Pgno nTrunc = 0;
  8. #ifndef SQLITE_OMIT_AUTOVACUUM
  9. if( pBt->autoVacuum ){
  10. rc = autoVacuumCommit(pBt, &nTrunc);
  11. if( rc!=SQLITE_OK ){
  12. return rc;
  13. }
  14. }
  15. #endif
  16. // Ask the pager to perform the sync, passing along nTrunc (left at 0 unless autoVacuumCommit set it)
  17. rc = sqlite3pager_sync(pBt->pPager, zMaster, nTrunc);
  18. }
  19. return rc;
  20. }
  21. // Write all of the pager's dirty pages back to the database file and sync it; returns SQLITE_OK or the first error code encountered.
  22. int sqlite3pager_sync(Pager *pPager, const char *zMaster, Pgno nTrunc){
  23. int rc = SQLITE_OK;
  24. TRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n",
  25. pPager->zFilename, zMaster, nTrunc);
  26. /* If this is an in-memory db, or no pages have been written to, or this
  27. ** function has already been called, it is a no-op.
  28. */
  29. // Only do the sync when the pager is not already in the PAGER_SYNCED state,
  30. // is not an in-memory database, and dirtyCache is set
  31. if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
  32. PgHdr *pPg;
  33. assert( pPager->journalOpen );
  34. /* If a master journal file name has already been written to the
  35. ** journal file, then no sync is required. This happens when it is
  36. ** written, then the process fails to upgrade from a RESERVED to an
  37. ** EXCLUSIVE lock. The next time the process tries to commit the
  38. ** transaction the m-j name will have already been written.
  39. */
  40. if( !pPager->setMaster ){
  41. // Bump the pager's change counter (pager_incr_changecounter)
  42. rc = pager_incr_changecounter(pPager);
  43. if( rc!=SQLITE_OK ) goto sync_exit;
  44. #ifndef SQLITE_OMIT_AUTOVACUUM
  45. if( nTrunc!=0 ){
  46. /* If this transaction has made the database smaller, then all pages
  47. ** being discarded by the truncation must be written to the journal
  48. ** file.
  49. */
  50. Pgno i;
  51. void *pPage;
  52. int iSkip = PAGER_MJ_PGNO(pPager);
  53. for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
  54. if( !(pPager->aInJournal[i/8] & (1<<(i&7))) && i!=iSkip ){
  55. rc = sqlite3pager_get(pPager, i, &pPage);
  56. if( rc!=SQLITE_OK ) goto sync_exit;
  57. rc = sqlite3pager_write(pPage);
  58. sqlite3pager_unref(pPage);
  59. if( rc!=SQLITE_OK ) goto sync_exit;
  60. }
  61. }
  62. }
  63. #endif
  64. rc = writeMasterJournal(pPager, zMaster);
  65. if( rc!=SQLITE_OK ) goto sync_exit;
  66. // Sync the journal file
  67. rc = syncJournal(pPager);
  68. if( rc!=SQLITE_OK ) goto sync_exit;
  69. }
  70. #ifndef SQLITE_OMIT_AUTOVACUUM
  71. if( nTrunc!=0 ){
  72. rc = sqlite3pager_truncate(pPager, nTrunc);
  73. if( rc!=SQLITE_OK ) goto sync_exit;
  74. }
  75. #endif
  76. /* Write all dirty pages to the database file */
  77. pPg = pager_get_all_dirty_pages(pPager);
  78. // Write all dirty pages back to the database file through the OS layer
  79. rc = pager_write_pagelist(pPg);
  80. if( rc!=SQLITE_OK ) goto sync_exit;
  81. /* Sync the database file. */
  82. // Sync the database file (skipped when noSync is set)
  83. if( !pPager->noSync ){
  84. rc = sqlite3OsSync(pPager->fd, 0);
  85. }
  86. pPager->state = PAGER_SYNCED;
  87. }else if( MEMDB && nTrunc!=0 ){
  88. rc = sqlite3pager_truncate(pPager, nTrunc);
  89. }
  90. sync_exit:
  91. return rc;
  92. }

下图可以进一步解释该过程:

document/2015-09-15/55f7c4d53d798