TiDB Lightning Configuration

This document provides samples for global configuration and task configuration, and describes the usage of command-line parameters.

Configuration files

TiDB Lightning has two configuration classes: “global” and “task”, and they have compatible structures. Their distinction arises only when the server mode is enabled. When server mode is disabled (the default), TiDB Lightning will only execute one task, and the same configuration file is used for both global and task configurations.

TiDB Lightning (Global)

  1. ### tidb-lightning global configuration
  2. [lightning]
  3. # The HTTP port for displaying the web interface, pulling Prometheus metrics, exposing debug data,
  4. # and submitting import tasks (in server mode). Setting it to 0 disables the port.
  5. status-addr = ':8289'
  6. # Server mode. Defaults to false, which means an import task starts immediately after you execute the command.
  7. # If this value is set to true, after you execute the command,
  8. # TiDB Lightning waits until you submit an import task in the web interface.
  9. # See the "TiDB Lightning Web Interface" section for details.
  10. server-mode = false
  11. # Logging
  12. level = "info"
  13. file = "tidb-lightning.log"
  14. max-size = 128 # MB
  15. max-days = 28
  16. max-backups = 14

TiDB Lightning (Task)

  1. ### tidb-lightning task configuration
  2. [lightning]
  3. # Checks whether the cluster satisfies the minimum requirement before starting the task, and checks whether TiKV has more than 10% free space left during running time.
  4. #check-requirements = true
  5. # The maximum number of engines to be opened concurrently.
  6. # Each table is split into one "index engine" to store indices, and multiple
  7. # "data engines" to store row data. These settings control the maximum
  8. # concurrent number for each type of engines.
  9. # These values affect the memory and disk usage of tikv-importer.
  10. # The sum of these two values must not exceed the max-open-engines setting
  11. # for tikv-importer.
  12. index-concurrency = 2
  13. table-concurrency = 6
  14. # The concurrency number of data. It is set to the number of logical CPU
  15. # cores by default. When deploying together with other components, you can
  16. # set it to 75% of the number of logical CPU cores to limit the CPU usage.
  17. # region-concurrency =
  18. # The maximum I/O concurrency. Excessive I/O concurrency causes an increase in
  19. # I/O latency because the disk's internal buffer is frequently refreshed,
  20. # which causes cache misses and slows down the read speed. Depending on the storage
  21. # medium, this value might need to be adjusted for optimal performance.
  22. io-concurrency = 5
  23. # The maximum number of non-fatal errors to tolerate before stopping TiDB Lightning.
  24. # Non-fatal errors are localized to a few rows, and ignoring those rows allows the import process to continue.
  25. # Setting this to N means that TiDB Lightning will stop as soon as possible when the (N+1)-th error is encountered.
  26. # The skipped rows will be inserted into tables inside the "task info" schema on the target TiDB, which can be configured below.
  27. # The default value is `MaxInt64`, that is, 9223372036854775807.
  28. max-error = 0
  29. # task-info-schema-name is the name of the schema or database that stores TiDB Lightning execution results.
  30. # To disable error recording, set this to an empty string.
  31. # task-info-schema-name = 'lightning_task_info'
  32. # In parallel import mode, the schema name that stores the meta information for each TiDB Lightning instance in the target cluster.
  33. # By default, the value is "lightning_metadata".
  34. # Configure this parameter only if parallel import is enabled.
  35. # **Note:**
  36. # - The value set for this parameter must be the same for each TiDB Lightning instance
  37. # that participates in the same parallel import; otherwise, the correctness of the imported data cannot be ensured.
  38. # - If parallel import mode is enabled, make sure that the user used for import (for the tidb.user configuration)
  39. # has permissions to create and access the databases corresponding to this configuration.
  40. # - TiDB Lightning removes this schema after the import is completed.
  41. # So do not use any existing schema name to configure this parameter.
  42. meta-schema-name = "lightning_metadata"
  43. [security]
  44. # Specifies certificates and keys for TLS connections within the cluster.
  45. # Public certificate of the CA. Leave empty to disable TLS.
  46. # ca-path = "/path/to/ca.pem"
  47. # Public certificate of this service.
  48. # cert-path = "/path/to/lightning.pem"
  49. # Private key of this service.
  50. # key-path = "/path/to/lightning.key"
  51. [checkpoint]
  52. # Whether to enable checkpoints.
  53. # While importing data, TiDB Lightning records which tables have been imported, so
  54. # even if TiDB Lightning or another component crashes, you can start from a known
  55. # good state instead of restarting from scratch.
  56. enable = true
  57. # The schema name (database name) to store the checkpoints.
  58. schema = "tidb_lightning_checkpoint"
  59. # Where to store the checkpoints.
  60. # - file: store as a local file.
  61. # - mysql: store into a remote MySQL-compatible database
  62. driver = "file"
  63. # The data source name (DSN) indicating the location of the checkpoint storage.
  64. # For the "file" driver, the DSN is a path. If the path is not specified, TiDB Lightning would
  65. # default to "/tmp/CHECKPOINT_SCHEMA.pb".
  66. # For the "mysql" driver, the DSN is a URL in the form of "USER:PASS@tcp(HOST:PORT)/".
  67. # If the URL is not specified, the TiDB server from the [tidb] section is used to
  68. # store the checkpoints. You should specify a different MySQL-compatible
  69. # database server to reduce the load of the target TiDB cluster.
  70. # dsn = "/tmp/tidb_lightning_checkpoint.pb"
  71. # Whether to keep the checkpoints after all data are imported. If false, the
  72. # checkpoints will be deleted. Keeping the checkpoints can aid debugging but
  73. # will leak metadata about the data source.
  74. # keep-after-success = false
  75. [tikv-importer]
  76. # "local": Physical import mode, used by default. It applies to large dataset import,
  77. # for example, greater than 1 TiB. However, during the import, downstream TiDB is not available to provide services.
  78. # "tidb": Logical import mode. You can use this mode for small dataset import,
  79. # for example, smaller than 1 TiB. During the import, downstream TiDB is available to provide services.
  80. # backend = "local"
  81. # Whether to enable multiple TiDB Lightning instances (in physical import mode) to import data to one or more target tables in parallel.
  82. # The default value is `false`.
  83. # When you use parallel import mode, you must set the parameter to `true`,
  84. # but the premise is that no data exists in the target table, that is, all data can only be imported by TiDB Lightning.
  85. # Note that this parameter **is not for incremental data import** and is only used in scenarios where the target table is empty.
  86. # incremental-import = false
  87. # The listening address of tikv-importer when backend is "importer". Change it to the actual address.
  88. addr = "172.16.31.10:8287"
  89. # Action to do when trying to insert a conflicting record in the logical import mode.
  90. # For more information on the conflict detection, see the document: https://docs.pingcap.com/tidb/v7.1/tidb-lightning-logical-import-mode-usage#conflict-detection
  91. # - replace: use new entry to replace the existing entry
  92. # - ignore: keep the existing entry, and ignore the new entry
  93. # - error: report error and quit the program
  94. # on-duplicate = "replace"
  95. # Whether to detect and resolve duplicate records (unique key conflict) in the physical import mode.
  96. # The following resolution algorithms are supported:
  97. # - none: does not detect duplicate records, which has the best performance of the two algorithms.
  98. # But if there are duplicate records in the data source, it might lead to inconsistent data in the target TiDB.
  99. # - remove: if there are primary key or unique key conflicts between the inserting data A and B,
  100. # A and B will be removed from the target table and recorded
  101. # in the `lightning_task_info.conflict_error_v1` table in the target TiDB.
  102. # You can manually insert the correct records into the target table based on your business requirements.
  103. # Note that the target TiKV must be v5.2.0 or later versions; otherwise it falls back to 'none'.
  104. # The default value is 'none'.
  105. # duplicate-resolution = 'none'
  106. # The number of KV pairs sent in one request in the physical import mode.
  107. # send-kv-pairs = 3200
  108. # Whether to enable compression when sending KV pairs to TiKV in the physical import mode.
  109. # Currently, only the Gzip compression algorithm is supported.
  110. # To use this algorithm, you can fill in either "gzip" or "gz" for this parameter.
  111. # By default, the compression is not enabled.
  112. # compress-kv-pairs = ""
  113. # The directory of local KV sorting in the physical import mode. If the disk
  114. # performance is low (such as in HDD), it is recommended to set the directory
  115. # on a different disk from `data-source-dir` to improve import speed.
  116. # sorted-kv-dir = ""
  117. # The concurrency that TiKV writes KV data in the physical import mode.
  118. # When the network transmission speed between TiDB Lightning and TiKV
  119. # exceeds 10 Gigabit, you can increase this value accordingly.
  120. # range-concurrency = 16
  121. # Limits the bandwidth at which TiDB Lightning writes data into each TiKV
  122. # node in the physical import mode. 0 by default, which means no limit.
  123. # store-write-bwlimit = "128MiB"
  124. # Specifies the disk quota for local temporary files when physical import mode is used.
  125. # When the disk quota is insufficient, TiDB Lightning stops reading source data and writing temporary files,
  126. # but prioritizes writing the already sorted key-value pairs to TiKV.
  127. # After TiDB Lightning deletes the local temporary files, the import process continues.
  128. # This option takes effect only when you set the `backend` option to `local`.
  129. # The default value is `MaxInt64` bytes, that is, 9223372036854775807 bytes.
  130. # disk-quota = "10GB"
  131. # Specifies whether Physical Import Mode adds indexes via SQL.
  132. # The default value is `false`, which means that TiDB Lightning will encode both row data and index data
  133. # into KV pairs and import them into TiKV together.
  134. # This mechanism is consistent with that of the historical versions.
  135. # If you set it to `true`, it means that TiDB Lightning adds indexes via SQL after importing the row data.
  136. # The benefit of adding indexes via SQL is that you can separately import data and import indexes,
  137. # and import data more quickly. After the data is imported, even if the indexes fail to be added,
  138. # it does not affect the consistency of the imported data.
  139. # add-index-by-sql = false
  140. # When you use TiDB Lightning to import a multi-tenant TiDB cluster, use this parameter to specify the corresponding key space name.
  141. # The default value is an empty string, which means TiDB Lightning will automatically get the key space name of the corresponding tenant to import data.
  142. # If you specify a value, the specified key space name will be used to import data.
  143. # keyspace-name = ""
  144. # In Physical Import Mode, this parameter controls the scope in which TiDB Lightning stops PD scheduling.
  145. # The value options are as follows:
  146. # - "table": pause scheduling only for the Region that stores the target table data. The default value is "table".
  147. # - "global": pause global scheduling. When importing data to a cluster without any business traffic,
  148. # it is recommended to set this parameter to "global" to avoid interference from other scheduling.
  149. # pause-pd-scheduler-scope = "table"
  150. # In Physical Import Mode, this parameter controls the number of Regions when splitting Regions in a batch.
  151. # The maximum number of Regions that can be split at the same time per TiDB Lightning instance is:
  152. # region-split-batch-size * region-split-concurrency * table-concurrency
  153. # This parameter is introduced in v7.1.0. The default value is `4096`.
  154. # region-split-batch-size = 4096
  155. # In Physical Import Mode, this parameter controls the concurrency when splitting Regions.
  156. # The default value is the number of CPU cores.
  157. # This parameter is introduced in v7.1.0.
  158. # region-split-concurrency =
  159. # In Physical Import Mode, this parameter controls the number of retries to wait for the Region to come online
  160. # after the split and scatter operations.
  161. # The default value is `1800` and the maximum retry interval is two seconds.
  162. # The number of retries will not be increased if any Region becomes online between retries.
  163. # This parameter is introduced in v7.1.0.
  164. # region-check-backoff-limit = 1800
  165. [mydumper]
  166. # Block size for file reading. Keep it longer than the longest string of the data source.
  167. read-block-size = "64KiB" # default value
  168. # The engine file needs to be imported sequentially. Due to parallel processing,
  169. # multiple data engines will be imported at nearly the same time, and this
  170. # creates a queue and wastes resources. Therefore, TiDB Lightning slightly
  171. # increases the size of the first few batches to properly distribute
  172. # resources. The scale up factor is controlled by this parameter, which
  173. # expresses the ratio of duration between the "import" and "write" steps
  174. # with full concurrency. This can be calculated by using the ratio
  175. # (import duration/write duration) of a single table of size around 1 GiB.
  176. # The exact timing can be found in the log. If "import" is faster, the batch
  177. # size variance is smaller, and a ratio of zero means a uniform batch size.
  178. # This value should be in the range (0 <= batch-import-ratio < 1).
  179. batch-import-ratio = 0.75
  180. # Local source data directory or the URI of the external storage.
  181. # For more information about the URI of the external storage, see https://docs.pingcap.com/tidb/v6.6/backup-and-restore-storages#uri-format.
  182. data-source-dir = "/data/my_database"
  183. # The character set of the schema files, containing CREATE TABLE statements;
  184. # only supports one of:
  185. # - utf8mb4: the schema files must be encoded as UTF-8; otherwise, an error is reported.
  186. # - gb18030: the schema files must be encoded as GB-18030; otherwise,
  187. # an error is reported
  188. # - auto: (default) automatically detects whether the schema is UTF-8 or
  189. # GB-18030. An error is reported if the encoding is neither.
  190. # - binary: do not try to decode the schema files
  191. character-set = "auto"
  192. # Specifies the character set of the source data file.
  193. # Lightning converts the source file from the specified character set to UTF-8 encoding when importing.
  194. # Currently, this configuration only specifies the character set of the CSV files with the following options supported:
  195. # - utf8mb4: Indicates that the source data file uses UTF-8 encoding.
  196. # - GB18030: Indicates that the source data file uses the GB-18030 encoding.
  197. # - GBK: The source data file uses GBK encoding (GBK encoding is an extension of the GB-2312 character set, also known as Code Page 936).
  198. # - binary: Indicates that Lightning does not convert the encoding (by default).
  199. # If left blank, the default value "binary" is used, that is to say, Lightning does not convert the encoding.
  200. # Note that Lightning does not detect the character set of the source data file;
  201. # it only converts the source file and imports the data based on this configuration.
  202. # If the value of this configuration is not the same as the actual encoding of the source data file,
  203. # a failed import, data loss, or data disorder might occur.
  204. data-character-set = "binary"
  205. # Specifies the replacement character in case of incompatible characters during the character set conversion of the source data file.
  206. # This configuration must not be duplicated with field separators, quote definers, and line breaks.
  207. # The default value is "\uFFFD", which is the "error" Rune or Unicode replacement character in UTF-8 encoding.
  208. # Changing the default value might result in potential degradation of parsing performance for the source data file.
  209. data-invalid-char-replace = "\uFFFD"
  210. # The input data in a "strict" format speeds up processing.
  211. # "strict-format = true" requires that:
  212. # in CSV, every value cannot contain literal new lines (U+000A and U+000D, or \r and \n) even
  213. # when quoted, which means new lines are strictly used to separate rows.
  214. # "Strict" format allows TiDB Lightning to quickly locate split positions of a large file for parallel processing.
  215. # However, if the input data is not "strict", it might split a valid row of data in half and
  216. # corrupt the result.
  217. # The default value is false for safety instead of speed.
  218. strict-format = false
  219. # If strict-format is true, TiDB Lightning splits large CSV files into multiple chunks to process in
  220. # parallel. max-region-size is the maximum size of each chunk after splitting.
  221. # max-region-size = "256MiB" # default value
  222. # Only import tables if these wildcard rules are matched. See the corresponding section for details.
  223. filter = ['*.*', '!mysql.*', '!sys.*', '!INFORMATION_SCHEMA.*', '!PERFORMANCE_SCHEMA.*', '!METRICS_SCHEMA.*', '!INSPECTION_SCHEMA.*']
  224. # Configures how CSV files are parsed.
  225. [mydumper.csv]
  226. # Separator between fields. Must not be empty.
  227. separator = ','
  228. # Quoting delimiter. Empty value means no quoting.
  229. delimiter = '"'
  230. # Line terminator. Empty value means both "\n" (LF) and "\r\n" (CRLF) are line terminators.
  231. terminator = ''
  232. # Whether the CSV files contain a header.
  233. # If `header` is true, TiDB Lightning treats the first row as a table header and does not import it as data.
  234. # If `header` is false, the first row is also imported as CSV data.
  235. header = true
  236. # Whether the column names in the CSV file header are matched to those defined in the target table.
  237. # The default value is `true`, which means that you have confirmed that the column names in the CSV header
  238. # are consistent with those in the target table, so that even if the order of the columns is different between the two,
  239. # TiDB Lightning can still import the data successfully by mapping the column names.
  240. # If the column names between the CSV table header and the target table do not match
  241. # (for example, some column names in the CSV table header cannot be found in the target table)
  242. # but the column order is the same, set this configuration to `false`.
  243. # In this scenario, TiDB Lightning will ignore the CSV header to avoid errors and import the data
  244. # directly in the order of the columns in the target table.
  245. # Therefore, if the columns are not in the same order,
  246. # you need to manually adjust the order of the columns in the CSV file to be consistent with that
  247. # in the target table before importing;
  248. # otherwise data discrepancies might occur.
  249. # It is important to note that this parameter only applies if the `header` parameter is set to `true`.
  250. # If `header` is set to `false`, it means that the CSV file does not contain a header,
  251. # so this parameter is not relevant.
  252. header-schema-match = true
  253. # Whether the CSV contains any NULL value.
  254. # If `not-null` is true, all columns from CSV cannot be NULL.
  255. not-null = false
  256. # When `not-null` is false (that is, CSV can contain NULL),
  257. # fields equal to this value will be treated as NULL.
  258. null = '\N'
  259. # Whether to interpret backslash escapes inside fields.
  260. backslash-escape = true
  261. # If a line ends with a separator, remove it.
  262. trim-last-separator = false
  263. # [[mydumper.files]]
  264. # Expression used for parsing AWS Aurora parquet files
  265. # pattern = '(?i)^(?:[^/]*/)*([a-z0-9_]+)\.([a-z0-9_]+)/(?:[^/]*/)*(?:[a-z0-9\-_.]+\.(parquet))$'
  266. # schema = '$1'
  267. # table = '$2'
  268. # type = '$3'
  269. [tidb]
  270. # Configuration of any TiDB server from the cluster.
  271. host = "172.16.31.1"
  272. port = 4000
  273. user = "root"
  274. # Configure the password to connect to TiDB. The password can either be plaintext or Base64 encoded.
  275. password = ""
  276. # Table schema information is fetched from TiDB via this status-port.
  277. status-port = 10080
  278. # Address of any PD server from the cluster.
  279. pd-addr = "172.16.31.4:2379"
  280. # tidb-lightning imports TiDB as a library and generates some logs itself.
  281. # This setting controls the log level of the TiDB library.
  282. log-level = "error"
  283. # Sets the TiDB session variable to speed up the Checksum and Analyze operations. Note that if checksum-via-sql is set to "true", TiDB Lightning will execute the ADMIN CHECKSUM TABLE <table> SQL statement to perform the Checksum operation on TiDB. In this case, the following parameters `distsql-scan-concurrency` and `checksum-table-concurrency` will not take effect.
  284. # See https://docs.pingcap.com/tidb/stable/statistics#control-analyze-concurrency
  285. # for the meaning of each setting
  286. build-stats-concurrency = 20
  287. distsql-scan-concurrency = 15
  288. index-serial-scan-concurrency = 20
  289. checksum-table-concurrency = 2
  290. # The default SQL mode used to parse and execute the SQL statements.
  291. sql-mode = "ONLY_FULL_GROUP_BY,NO_AUTO_CREATE_USER"
  292. # Sets maximum packet size allowed for SQL connections.
  293. # Set this to 0 to automatically fetch the `max_allowed_packet` variable from server on every connection.
  294. max-allowed-packet = 67_108_864
  295. # Whether to use TLS for SQL connections. Valid values are:
  296. # - "": if configuration items in the [tidb.security] section are configured, TiDB Lightning requires TLS for SQL connections (same behavior as "cluster"). Otherwise, it uses an unencrypted connection.
  297. # - "false": same behavior as "".
  298. # - "cluster": requires TLS and verifies the server's certificate with the CA specified in the [tidb.security] section.
  299. # - "skip-verify": requires TLS but does not verify the server's certificate (insecure). If the server does not support TLS, the connection falls back to an unencrypted state.
  300. # - "preferred": same behavior as "skip-verify".
  301. # tls = ""
  302. # Specifies certificates and keys for TLS-enabled MySQL connections.
  303. # Defaults to a copy of the [security] section.
  304. # [tidb.security]
  305. # Public certificate of the CA. Set to empty string to disable TLS for SQL.
  306. # ca-path = "/path/to/ca.pem"
  307. # Public certificate of this service. Defaults to a copy of `security.cert-path`.
  308. # cert-path = "/path/to/lightning.pem"
  309. # Private key of this service. Defaults to a copy of `security.key-path`.
  310. # key-path = "/path/to/lightning.key"
  311. # In the physical import mode, when data importing is complete, TiDB Lightning can
  312. # automatically perform the Checksum and Analyze operations. It is recommended
  313. # to leave these as true in the production environment.
  314. # The execution order: Checksum -> Analyze.
  315. # Note that in the logical import mode, Checksum and Analyze are not needed, and they are always
  316. # skipped in the actual operation.
  317. [post-restore]
  318. # Specifies whether to perform `ADMIN CHECKSUM TABLE <table>` for each table to verify data integrity after importing.
  319. # The following options are available:
  320. # - "required" (default value): Perform admin checksum. If checksum fails, TiDB Lightning will exit with failure.
  321. # - "optional": Perform admin checksum. If checksum fails, TiDB Lightning will report a WARN log but ignore any error.
  322. # - "off": Do not perform checksum.
  323. # Note that since v4.0.8, the default value has changed from "true" to "required".
  324. # Note:
  325. # 1. Checksum failure usually means import exception (data loss or inconsistency). It is recommended to always enable checksum.
  326. # 2. For backward compatibility, bool values "true" and "false" are also allowed for this field.
  327. # "true" is equivalent to "required" and "false" is equivalent to "off".
  328. checksum = "required"
  329. # Specifies whether the ADMIN CHECKSUM TABLE <table> operation is executed via TiDB.
  330. # The default value is "false", which means that the ADMIN CHECKSUM TABLE <table> command is sent to TiKV for execution via TiDB Lightning.
  331. # It is recommended that you set this value to "true" to make it easier to locate the problem if checksum fails.
  332. # Meanwhile, if you want to adjust concurrency when this value is "true", you need to set the `tidb_checksum_table_concurrency` variable in TiDB (https://docs.pingcap.com/tidb/stable/system-variables#tidb_checksum_table_concurrency).
  333. checksum-via-sql = "false"
  334. # Specifies whether to perform `ANALYZE TABLE <table>` for each table after checksum is done.
  335. # Options available for this field are the same as `checksum`. However, the default value for this field is "optional".
  336. analyze = "optional"
  337. # Configures the background periodic actions.
  338. # Supported units: h (hour), m (minute), s (second).
  339. [cron]
  340. # Duration between which TiDB Lightning automatically refreshes the import mode
  341. # status. Should be shorter than the corresponding TiKV setting.
  342. switch-mode = "5m"
  343. # Duration between which an import progress is printed to the log.
  344. log-progress = "5m"
  345. # The time interval for checking the local disk quota when you use the physical import mode.
  346. # The default value is 60 seconds.
  347. # check-disk-quota = "60s"