在本节中,我将会介绍索引文件sph的生成.从上一节我们得知,sph文件保存了Sphinx的索引元信息以及一些索引相关的配置信息.

SPH文件生成

先来看代码,其中sph文件的生成是在CSphIndex_VLN::WriteHeader这个函数中:

  1. bool CSphIndex_VLN::WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const
  2. {
  3. // version
  4. fdInfo.PutDword ( INDEX_MAGIC_HEADER );
  5. fdInfo.PutDword ( INDEX_FORMAT_VERSION );
  6. // bits
  7. fdInfo.PutDword ( USE_64BIT );
  8. // docinfo
  9. fdInfo.PutDword ( m_tSettings.m_eDocinfo );
  10. // schema
  11. WriteSchema ( fdInfo, m_tSchema );
  12. // min doc
  13. fdInfo.PutOffset ( tBuildHeader.m_uMinDocid ); // was dword in v.1
  14. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  15. fdInfo.PutBytes ( tBuildHeader.m_pMinRow, m_tSchema.GetRowSize()*sizeof(CSphRowitem) );
  16. // wordlist checkpoints
  17. fdInfo.PutOffset ( tBuildHeader.m_iDictCheckpointsOffset );
  18. fdInfo.PutDword ( tBuildHeader.m_iDictCheckpoints );
  19. fdInfo.PutByte ( tBuildHeader.m_iInfixCodepointBytes );
  20. fdInfo.PutDword ( (DWORD)tBuildHeader.m_iInfixBlocksOffset );
  21. fdInfo.PutDword ( tBuildHeader.m_iInfixBlocksWordsSize );
  22. // index stats
  23. fdInfo.PutDword ( (DWORD)tBuildHeader.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
  24. fdInfo.PutOffset ( tBuildHeader.m_iTotalBytes );
  25. fdInfo.PutDword ( tBuildHeader.m_iTotalDups );
  26. // index settings
  27. SaveIndexSettings ( fdInfo, m_tSettings );
  28. // tokenizer info
  29. assert ( m_pTokenizer );
  30. SaveTokenizerSettings ( fdInfo, m_pTokenizer, m_tSettings.m_iEmbeddedLimit );
  31. // dictionary info
  32. assert ( m_pDict );
  33. SaveDictionarySettings ( fdInfo, m_pDict, false, m_tSettings.m_iEmbeddedLimit );
  34. fdInfo.PutDword ( tBuildHeader.m_uKillListSize );
  35. fdInfo.PutOffset ( tBuildHeader.m_iMinMaxIndex );
  36. // field filter info
  37. SaveFieldFilterSettings ( fdInfo, m_pFieldFilter );
  38. // average field lengths
  39. if ( m_tSettings.m_bIndexFieldLens )
  40. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  41. fdInfo.PutOffset ( m_dFieldLens[i] );
  42. return true;
  43. }

然后按顺序来解释下每一项字段的含义.

  • 前两个字段INDEX_MAGIC_HEADER和INDEX_FORMAT_VERSION分别是magic number和索引版本号
  • 第三个字段USE_64BIT表示是否使用64位的document和word id(默认是使用).
  • 然后是写入docinfo,这个字段也就是配置中的docinfo字段(index block中)
  • 接下来将会写入schema,也就是索引的schema信息,比如当前索引的字段名,当前需要建立的属性名等等.
  1. void WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema )
  2. {
  3. // schema
  4. fdInfo.PutDword ( tSchema.m_dFields.GetLength() );
  5. ARRAY_FOREACH ( i, tSchema.m_dFields )
  6. WriteSchemaColumn ( fdInfo, tSchema.m_dFields[i] );
  7. fdInfo.PutDword ( tSchema.GetAttrsCount() );
  8. for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
  9. WriteSchemaColumn ( fdInfo, tSchema.GetAttr(i) );
  10. }
  • 然后是写入当前索引集的最小doc id(m_uMinDocid)
  • 接下来是根据docinfo(也就是属性存储)的配置来选择是否写入最小行(m_pMinRow):只有当docinfo为inline时才会写入(docinfo为inline表示attribute value将会存储在spd文件中).
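  • 然后是写入wordlist的checkpoint信息,包括checkpoint的偏移(m_iDictCheckpointsOffset)和个数(m_iDictCheckpoints),以及infix相关的三个字段(m_iInfixCodepointBytes/m_iInfixBlocksOffset/m_iInfixBlocksWordsSize).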
  • 然后是索引的统计信息(m_iTotalDocuments/m_iTotalBytes/m_iTotalDups).
  • 接下来是写入对应的索引配置信息
  1. void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings )
  2. {
  3. tWriter.PutDword ( tSettings.m_iMinPrefixLen );
  4. tWriter.PutDword ( tSettings.m_iMinInfixLen );
  5. tWriter.PutDword ( tSettings.m_iMaxSubstringLen );
  6. tWriter.PutByte ( tSettings.m_bHtmlStrip ? 1 : 0 );
  7. tWriter.PutString ( tSettings.m_sHtmlIndexAttrs.cstr () );
  8. tWriter.PutString ( tSettings.m_sHtmlRemoveElements.cstr () );
  9. tWriter.PutByte ( tSettings.m_bIndexExactWords ? 1 : 0 );
  10. tWriter.PutDword ( tSettings.m_eHitless );
  11. tWriter.PutDword ( tSettings.m_eHitFormat );
  12. tWriter.PutByte ( tSettings.m_bIndexSP );
  13. tWriter.PutString ( tSettings.m_sZones );
  14. tWriter.PutDword ( tSettings.m_iBoundaryStep );
  15. tWriter.PutDword ( tSettings.m_iStopwordStep );
  16. tWriter.PutDword ( tSettings.m_iOvershortStep );
  17. tWriter.PutDword ( tSettings.m_iEmbeddedLimit );
  18. tWriter.PutByte ( tSettings.m_eBigramIndex );
  19. tWriter.PutString ( tSettings.m_sBigramWords );
  20. tWriter.PutByte ( tSettings.m_bIndexFieldLens );
  21. tWriter.PutByte ( tSettings.m_eChineseRLP );
  22. tWriter.PutString ( tSettings.m_sRLPContext );
  23. tWriter.PutString ( tSettings.m_sIndexTokenFilter );
  24. }
  • 写入对应的tokenizer的配置信息:
  1. void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit )
  2. {
  3. assert ( pTokenizer );
  4. const CSphTokenizerSettings & tSettings = pTokenizer->GetSettings ();
  5. tWriter.PutByte ( tSettings.m_iType );
  6. tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
  7. tWriter.PutDword ( tSettings.m_iMinWordLen );
  8. bool bEmbedSynonyms = pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit;
  9. tWriter.PutByte ( bEmbedSynonyms ? 1 : 0 );
  10. if ( bEmbedSynonyms )
  11. pTokenizer->WriteSynonyms ( tWriter );
  12. tWriter.PutString ( tSettings.m_sSynonymsFile.cstr () );
  13. WriteFileInfo ( tWriter, pTokenizer->GetSynFileInfo () );
  14. tWriter.PutString ( tSettings.m_sBoundary.cstr () );
  15. tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
  16. tWriter.PutDword ( tSettings.m_iNgramLen );
  17. tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
  18. tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
  19. tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
  20. }
  • 写入dictionary的配置信息(比如stop word之类).
  1. void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit )
  2. {
  3. assert ( pDict );
  4. const CSphDictSettings & tSettings = pDict->GetSettings ();
  5. tWriter.PutString ( tSettings.m_sMorphology.cstr () );
  6. .............................
  7. bool bEmbedStopwords = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
  8. tWriter.PutByte ( bEmbedStopwords ? 1 : 0 );
  9. if ( bEmbedStopwords )
  10. pDict->WriteStopwords ( tWriter );
  11. tWriter.PutString ( tSettings.m_sStopwords.cstr () );
  12. tWriter.PutDword ( dSWFileInfos.GetLength () );
  13. ARRAY_FOREACH ( i, dSWFileInfos )
  14. {
  15. tWriter.PutString ( dSWFileInfos[i].m_sFilename.cstr () );
  16. WriteFileInfo ( tWriter, dSWFileInfos[i] );
  17. }
  18. const CSphVector <CSphSavedFile> & dWFFileInfos = pDict->GetWordformsFileInfos ();
  19. uTotalSize = 0;
  20. ARRAY_FOREACH ( i, dWFFileInfos )
  21. uTotalSize += dWFFileInfos[i].m_uSize;
  22. bool bEmbedWordforms = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
  23. tWriter.PutByte ( bEmbedWordforms ? 1 : 0 );
  24. if ( bEmbedWordforms )
  25. pDict->WriteWordforms ( tWriter );
  26. tWriter.PutDword ( dWFFileInfos.GetLength() );
  27. ARRAY_FOREACH ( i, dWFFileInfos )
  28. {
  29. tWriter.PutString ( dWFFileInfos[i].m_sFilename.cstr() );
  30. WriteFileInfo ( tWriter, dWFFileInfos[i] );
  31. }
  32. tWriter.PutDword ( tSettings.m_iMinStemmingLen );
  33. tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
  34. tWriter.PutByte ( tSettings.m_bStopwordsUnstemmed );
  35. tWriter.PutString ( pDict->GetMorphDataFingerprint() );
  36. }
  • 然后是写入killlist的size(m_uKillListSize)
  • 写入m_iMinMaxIndex.从下面的代码片段可以看到,它的值等于m_iDocinfo乘以每行的步长(DOCINFO_IDSIZE加上schema的行大小),也就是全部docinfo行所占的总大小(以行元素为单位),而不是单个document的大小.
  1. CSphFixedVector<CSphRowitem> dMinRow ( tNewSchema.GetRowSize() );
  2. ...............
  3. int iNewStride = DOCINFO_IDSIZE + tNewSchema.GetRowSize();
  4. int64_t iNewMinMaxIndex = m_iDocinfo * iNewStride;
  5. ..............................
  6. tBuildHeader.m_iMinMaxIndex = iNewMinMaxIndex;
  • 写入regex相关配置(regexp_filter)
  1. void SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter )
  2. {
  3. if ( !pFieldFilter )
  4. {
  5. tWriter.PutDword ( 0 );
  6. return;
  7. }
  8. CSphFieldFilterSettings tSettings;
  9. pFieldFilter->GetSettings ( tSettings );
  10. tWriter.PutDword ( tSettings.m_dRegexps.GetLength() );
  11. ARRAY_FOREACH ( i, tSettings.m_dRegexps )
  12. tWriter.PutString ( tSettings.m_dRegexps[i] );
  13. tWriter.PutByte(1); // deprecated utf8 flag
  14. }
  • 最后,如果m_tSettings.m_bIndexFieldLens为真,则为schema中的每个field依次写入其长度信息(m_dFieldLens,代码注释中称为average field lengths).