Installing Hive lineage

This document describes how to collect lineage information for the ‘Hive’ engine in ‘Linkis’.

Hive provides a built-in lineage hook called LineageLogger, which is used to capture and record lineage information generated during query execution. By using the LineageLogger hook, you can capture and log the input and output tables, as well as column-level lineage relationships for queries.

  1. vim $HIVE_HOME/conf/hive-site.xml
  2. Add the following configuration
  3. <property>
  4. <name>hive.exec.post.hooks</name>
  5. <value>org.apache.hadoop.hive.ql.hooks.LineageLogger</value>
  6. </property>
  1. vim $HIVE_HOME/conf/hive-log4j2.properties
  2. Add the following configuration
  3. log4j.logger.org.apache.hadoop.hive.ql.hooks.LineageLogger=INFO
  1. sh ./bin/linkis-cli -engineType hive-3.1.3 \
  2. -codeType hql -code \
  3. "CREATE TABLE input_table (
  4. column1 INT,
  5. column2 STRING
  6. );
  7. CREATE TABLE output_table (
  8. column3 INT,
  9. column4 STRING
  10. );
  11. INSERT INTO TABLE output_table
  12. SELECT column1, column2
  13. FROM input_table;" \
  14. -submitUser hadoop -proxyUser hadoop
  1. cat /appcom/tmp/hadoop/20230922/hive/946375fe-f189-487c-b3a7-f9fa821edace/logs/stdout

The output is as follows: hive-lineage-log

Details are as follows:

  1. {
  2. "version":"1.0",
  3. "user":"hadoop",
  4. "timestamp":1695354104,
  5. "duration":15318,
  6. "jobIds":[
  7. "job_1691375506204_0488"
  8. ],
  9. "engine":"mr",
  10. "database":"default",
  11. "hash":"dbb11fce57f10dccb6ef724f66af611c",
  12. "queryText":"INSERT INTO TABLE output_table\nSELECT column1, column2\nFROM input_table",
  13. "edges":[
  14. {
  15. "sources":[
  16. 2
  17. ],
  18. "targets":[
  19. 0
  20. ],
  21. "edgeType":"PROJECTION"
  22. },
  23. {
  24. "sources":[
  25. 3
  26. ],
  27. "targets":[
  28. 1
  29. ],
  30. "edgeType":"PROJECTION"
  31. },
  32. {
  33. "sources":[
  34. 2
  35. ],
  36. "targets":[
  37. 0
  38. ],
  39. "expression":"compute_stats(default.input_table.column1, 'hll')",
  40. "edgeType":"PROJECTION"
  41. },
  42. {
  43. "sources":[
  44. 3
  45. ],
  46. "targets":[
  47. 1
  48. ],
  49. "expression":"compute_stats(default.input_table.column2, 'hll')",
  50. "edgeType":"PROJECTION"
  51. }
  52. ],
  53. "vertices":[
  54. {
  55. "id":0,
  56. "vertexType":"COLUMN",
  57. "vertexId":"default.output_table.column3"
  58. },
  59. {
  60. "id":1,
  61. "vertexType":"COLUMN",
  62. "vertexId":"default.output_table.column4"
  63. },
  64. {
  65. "id":2,
  66. "vertexType":"COLUMN",
  67. "vertexId":"default.input_table.column1"
  68. },
  69. {
  70. "id":3,
  71. "vertexType":"COLUMN",
  72. "vertexId":"default.input_table.column2"
  73. }
  74. ]
  75. }