集成 Hive 血缘

本文主要介绍 Linkis 中 Hive 引擎的血缘采集方案。

Hive 提供了一个内置的 Hook——LineageLogger,用于记录查询执行期间生成的血缘信息。通过使用 LineageLogger Hook,你可以捕获并记录查询的输入表、输出表以及列级别的血缘关系。

修改 `hive-site.xml`,注册 LineageLogger Hook:

```shell
vim $HIVE_HOME/conf/hive-site.xml
```

增加如下配置:

```xml
<property>
  <name>hive.exec.post.hooks</name>
  <value>org.apache.hadoop.hive.ql.hooks.LineageLogger</value>
</property>
```
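修改 `hive-log4j2.properties`,为 LineageLogger 开启 INFO 级别日志:

```shell
vim $HIVE_HOME/conf/hive-log4j2.properties
```

增加如下配置:

```properties
log4j.logger.org.apache.hadoop.hive.ql.hooks.LineageLogger=INFO
```

注意:上面是 Log4j 1.x 的写法。如果该文件使用 Log4j2 的 properties 语法,等价配置通常如下(请结合实际环境确认);若文件中定义了 `loggers` 列表,还需要把 `LineageLogger` 加入该列表:

```properties
logger.LineageLogger.name = org.apache.hadoop.hive.ql.hooks.LineageLogger
logger.LineageLogger.level = INFO
```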
通过 linkis-cli 提交一个 Hive 任务,用于触发血缘采集:

```shell
sh ./bin/linkis-cli -engineType hive-3.1.3 \
-codeType hql -code \
"CREATE TABLE input_table (
column1 INT,
column2 STRING
);
CREATE TABLE output_table (
column3 INT,
column4 STRING
);
INSERT INTO TABLE output_table
SELECT column1, column2
FROM input_table;" \
-submitUser hadoop -proxyUser hadoop
```
任务执行完成后,查看引擎的 stdout 日志(路径中的日期和 ID 为示例值,请替换为实际值):

```shell
cat /appcom/tmp/hadoop/20230922/hive/946375fe-f189-487c-b3a7-f9fa821edace/logs/stdout
```

输出结果如下(见截图 hive-lineage-log):

详细信息如下:

```json
{
  "version": "1.0",
  "user": "hadoop",
  "timestamp": 1695354104,
  "duration": 15318,
  "jobIds": [
    "job_1691375506204_0488"
  ],
  "engine": "mr",
  "database": "default",
  "hash": "dbb11fce57f10dccb6ef724f66af611c",
  "queryText": "INSERT INTO TABLE output_table\nSELECT column1, column2\nFROM input_table",
  "edges": [
    {
      "sources": [2],
      "targets": [0],
      "edgeType": "PROJECTION"
    },
    {
      "sources": [3],
      "targets": [1],
      "edgeType": "PROJECTION"
    },
    {
      "sources": [2],
      "targets": [0],
      "expression": "compute_stats(default.input_table.column1, 'hll')",
      "edgeType": "PROJECTION"
    },
    {
      "sources": [3],
      "targets": [1],
      "expression": "compute_stats(default.input_table.column2, 'hll')",
      "edgeType": "PROJECTION"
    }
  ],
  "vertices": [
    {
      "id": 0,
      "vertexType": "COLUMN",
      "vertexId": "default.output_table.column3"
    },
    {
      "id": 1,
      "vertexType": "COLUMN",
      "vertexId": "default.output_table.column4"
    },
    {
      "id": 2,
      "vertexType": "COLUMN",
      "vertexId": "default.input_table.column1"
    },
    {
      "id": 3,
      "vertexType": "COLUMN",
      "vertexId": "default.input_table.column2"
    }
  ]
}
```