航班飞行数据

下载数据:

  1. for s in `seq 1987 2018`
  2. do
  3. for m in `seq 1 12`
  4. do
  5. wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip
  6. done
  7. done

(引用 https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh )

创建表结构:

  1. CREATE TABLE `ontime` (
  2. `Year` UInt16,
  3. `Quarter` UInt8,
  4. `Month` UInt8,
  5. `DayofMonth` UInt8,
  6. `DayOfWeek` UInt8,
  7. `FlightDate` Date,
  8. `UniqueCarrier` FixedString(7),
  9. `AirlineID` Int32,
  10. `Carrier` FixedString(2),
  11. `TailNum` String,
  12. `FlightNum` String,
  13. `OriginAirportID` Int32,
  14. `OriginAirportSeqID` Int32,
  15. `OriginCityMarketID` Int32,
  16. `Origin` FixedString(5),
  17. `OriginCityName` String,
  18. `OriginState` FixedString(2),
  19. `OriginStateFips` String,
  20. `OriginStateName` String,
  21. `OriginWac` Int32,
  22. `DestAirportID` Int32,
  23. `DestAirportSeqID` Int32,
  24. `DestCityMarketID` Int32,
  25. `Dest` FixedString(5),
  26. `DestCityName` String,
  27. `DestState` FixedString(2),
  28. `DestStateFips` String,
  29. `DestStateName` String,
  30. `DestWac` Int32,
  31. `CRSDepTime` Int32,
  32. `DepTime` Int32,
  33. `DepDelay` Int32,
  34. `DepDelayMinutes` Int32,
  35. `DepDel15` Int32,
  36. `DepartureDelayGroups` String,
  37. `DepTimeBlk` String,
  38. `TaxiOut` Int32,
  39. `WheelsOff` Int32,
  40. `WheelsOn` Int32,
  41. `TaxiIn` Int32,
  42. `CRSArrTime` Int32,
  43. `ArrTime` Int32,
  44. `ArrDelay` Int32,
  45. `ArrDelayMinutes` Int32,
  46. `ArrDel15` Int32,
  47. `ArrivalDelayGroups` Int32,
  48. `ArrTimeBlk` String,
  49. `Cancelled` UInt8,
  50. `CancellationCode` FixedString(1),
  51. `Diverted` UInt8,
  52. `CRSElapsedTime` Int32,
  53. `ActualElapsedTime` Int32,
  54. `AirTime` Int32,
  55. `Flights` Int32,
  56. `Distance` Int32,
  57. `DistanceGroup` UInt8,
  58. `CarrierDelay` Int32,
  59. `WeatherDelay` Int32,
  60. `NASDelay` Int32,
  61. `SecurityDelay` Int32,
  62. `LateAircraftDelay` Int32,
  63. `FirstDepTime` String,
  64. `TotalAddGTime` String,
  65. `LongestAddGTime` String,
  66. `DivAirportLandings` String,
  67. `DivReachedDest` String,
  68. `DivActualElapsedTime` String,
  69. `DivArrDelay` String,
  70. `DivDistance` String,
  71. `Div1Airport` String,
  72. `Div1AirportID` Int32,
  73. `Div1AirportSeqID` Int32,
  74. `Div1WheelsOn` String,
  75. `Div1TotalGTime` String,
  76. `Div1LongestGTime` String,
  77. `Div1WheelsOff` String,
  78. `Div1TailNum` String,
  79. `Div2Airport` String,
  80. `Div2AirportID` Int32,
  81. `Div2AirportSeqID` Int32,
  82. `Div2WheelsOn` String,
  83. `Div2TotalGTime` String,
  84. `Div2LongestGTime` String,
  85. `Div2WheelsOff` String,
  86. `Div2TailNum` String,
  87. `Div3Airport` String,
  88. `Div3AirportID` Int32,
  89. `Div3AirportSeqID` Int32,
  90. `Div3WheelsOn` String,
  91. `Div3TotalGTime` String,
  92. `Div3LongestGTime` String,
  93. `Div3WheelsOff` String,
  94. `Div3TailNum` String,
  95. `Div4Airport` String,
  96. `Div4AirportID` Int32,
  97. `Div4AirportSeqID` Int32,
  98. `Div4WheelsOn` String,
  99. `Div4TotalGTime` String,
  100. `Div4LongestGTime` String,
  101. `Div4WheelsOff` String,
  102. `Div4TailNum` String,
  103. `Div5Airport` String,
  104. `Div5AirportID` Int32,
  105. `Div5AirportSeqID` Int32,
  106. `Div5WheelsOn` String,
  107. `Div5TotalGTime` String,
  108. `Div5LongestGTime` String,
  109. `Div5WheelsOff` String,
  110. `Div5TailNum` String
  111. ) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192)

加载数据:

  1. for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done

查询:

Q0.

  1. select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);

Q1. 查询从2000年到2008年每天的航班数

  1. SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;

Q2. 查询从2000年到2008年每周延误超过10分钟的航班数。

  1. SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC

Q3. 查询2000年到2008年每个机场延误超过10分钟以上的次数

  1. SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10

Q4. 查询2007年各航空公司延误超过10分钟以上的次数

  1. SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC

Q5. 查询2007年各航空公司延误超过10分钟以上的百分比

  1. SELECT Carrier, c, c2, c*100/c2 as c3
  2. FROM
  3. (
  4. SELECT
  5. Carrier,
  6. count(*) AS c
  7. FROM ontime
  8. WHERE DepDelay>10
  9. AND Year=2007
  10. GROUP BY Carrier
  11. )
  12. ANY INNER JOIN
  13. (
  14. SELECT
  15. Carrier,
  16. count(*) AS c2
  17. FROM ontime
  18. WHERE Year=2007
  19. GROUP BY Carrier
  20. ) USING Carrier
  21. ORDER BY c3 DESC;

更好的查询版本:

  1. SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier

Q6. 同上一个查询一致,只是查询范围扩大到2000年到2008年

  1. SELECT Carrier, c, c2, c*100/c2 as c3
  2. FROM
  3. (
  4. SELECT
  5. Carrier,
  6. count(*) AS c
  7. FROM ontime
  8. WHERE DepDelay>10
  9. AND Year >= 2000 AND Year <= 2008
  10. GROUP BY Carrier
  11. )
  12. ANY INNER JOIN
  13. (
  14. SELECT
  15. Carrier,
  16. count(*) AS c2
  17. FROM ontime
  18. WHERE Year >= 2000 AND Year <= 2008
  19. GROUP BY Carrier
  20. ) USING Carrier
  21. ORDER BY c3 DESC;

更好的查询版本:

  1. SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier

Q7. 每年航班延误超过10分钟的百分比

  1. SELECT Year, c1/c2
  2. FROM
  3. (
  4. select
  5. Year,
  6. count(*)*100 as c1
  7. from ontime
  8. WHERE DepDelay>10
  9. GROUP BY Year
  10. )
  11. ANY INNER JOIN
  12. (
  13. select
  14. Year,
  15. count(*) as c2
  16. from ontime
  17. GROUP BY Year
  18. ) USING (Year)
  19. ORDER BY Year

更好的查询版本:

  1. SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year

Q8. 每年更受人们喜爱的目的地

  1. SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;

Q9.

  1. select Year, count(*) as c1 from ontime group by Year;

Q10.

  1. select
  2. min(Year), max(Year), Carrier, count(*) as cnt,
  3. sum(ArrDelayMinutes>30) as flights_delayed,
  4. round(sum(ArrDelayMinutes>30)/count(*),2) as rate
  5. FROM ontime
  6. WHERE
  7. DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
  8. and DestState not in ('AK', 'HI', 'PR', 'VI')
  9. and FlightDate < '2010-01-01'
  10. GROUP by Carrier
  11. HAVING cnt > 100000 and max(Year) > 1990
  12. ORDER by rate DESC
  13. LIMIT 1000;

Bonus:

  1. SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
  2. select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
  3. SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
  4. SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
  5. SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;

这个性能测试由Vadim Tkachenko提供。参考: