5. 连接SQL数据库

  1. # 在读取chinook数据库之前,需要创建SQLAlchemy引擎
  2. In[108]: from sqlalchemy import create_engine
  3. engine = create_engine('sqlite:///data/chinook.db')
  4. In[109]: tracks = pd.read_sql_table('tracks', engine)
  5. tracks.head()
  6. Out[109]:

5. 连接SQL数据库 - 图1

  1. # read_sql_table函数可以读取一张表,第一个参数是表名,第二个参数是引擎
  2. In[110]: genres = pd.read_sql_table('genres', engine)
  3. genres.head()
  4. Out[110]:

5. 连接SQL数据库 - 图2

  1. # 找到每种类型歌曲的平均时长
  2. In[111]: genre_track = genres.merge(tracks[['GenreId', 'Milliseconds']],
  3. on='GenreId', how='left') \
  4. .drop('GenreId', axis='columns')
  5. genre_track.head()
  6. Out[111]:

5. 连接SQL数据库 - 图3

  1. # 将Milliseconds列转变为timedelta数据类型
  2. In[112]: genre_time = genre_track.groupby('Name')['Milliseconds'].mean()
  3. pd.to_timedelta(genre_time, unit='ms').dt.floor('s').sort_values()
  4. Out[112]:
  5. Name
  6. Rock And Roll 00:02:14
  7. Opera 00:02:54
  8. Hip Hop/Rap 00:02:58
  9. Easy Listening 00:03:09
  10. Bossa Nova 00:03:39
  11. R&B/Soul 00:03:40
  12. World 00:03:44
  13. Pop 00:03:49
  14. Latin 00:03:52
  15. Alternative & Punk 00:03:54
  16. Soundtrack 00:04:04
  17. Reggae 00:04:07
  18. Alternative 00:04:24
  19. Blues 00:04:30
  20. Rock 00:04:43
  21. Jazz 00:04:51
  22. Classical 00:04:53
  23. Heavy Metal 00:04:57
  24. Electronica/Dance 00:05:02
  25. Metal 00:05:09
  26. Comedy 00:26:25
  27. TV Shows 00:35:45
  28. Drama 00:42:55
  29. Science Fiction 00:43:45
  30. Sci Fi & Fantasy 00:48:31
  31. Name: Milliseconds, dtype: timedelta64[ns]
  1. # 找到每名顾客花费的总时长
  2. In[113]: cust = pd.read_sql_table('customers', engine,
  3. columns=['CustomerId', 'FirstName', 'LastName'])
  4. invoice = pd.read_sql_table('invoices', engine,
  5. columns=['InvoiceId','CustomerId'])
  6. ii = pd.read_sql_table('invoice_items', engine,
  7. columns=['InvoiceId', 'UnitPrice', 'Quantity'])
  8. In[114]: cust_inv = cust.merge(invoice, on='CustomerId') \
  9. .merge(ii, on='InvoiceId')
  10. cust_inv.head()
  11. Out[114]:

5. 连接SQL数据库 - 图4

  1. # 现在可以用总量乘以单位价格,找到每名顾客的总消费
  2. In[115]: total = cust_inv['Quantity'] * cust_inv['UnitPrice']
  3. cols = ['CustomerId', 'FirstName', 'LastName']
  4. cust_inv.assign(Total = total).groupby(cols)['Total'] \
  5. .sum() \
  6. .sort_values(ascending=False).head()
  7. Out[115]:

5. 连接SQL数据库 - 图5

更多

  1. # sql语句查询方法read_sql_query
  2. In[116]: pd.read_sql_query('select * from tracks limit 5', engine)
  3. Out[116]:

5. 连接SQL数据库 - 图6

  1. # 可以将长字符串传给read_sql_query
  2. In[117]: sql_string1 = '''
  3. select
  4. Name,
  5. time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
  6. from (
  7. select
  8. g.Name,
  9. t.Milliseconds
  10. from
  11. genres as g
  12. join
  13. tracks as t
  14. on
  15. g.genreid == t.genreid
  16. )
  17. group by
  18. Name
  19. order by
  20. avg_time
  21. '''
  22. pd.read_sql_query(sql_string1, engine)
  23. Out[117]:

5. 连接SQL数据库 - 图7

  1. In[118]: sql_string2 = '''
  2. select
  3. c.customerid,
  4. c.FirstName,
  5. c.LastName,
  6. sum(ii.quantity * ii.unitprice) as Total
  7. from
  8. customers as c
  9. join
  10. invoices as i
  11. on c.customerid = i.customerid
  12. join
  13. invoice_items as ii
  14. on i.invoiceid = ii.invoiceid
  15. group by
  16. c.customerid, c.FirstName, c.LastName
  17. order by
  18. Total desc
  19. '''
  20. pd.read_sql_query(sql_string2, engine)
  21. Out[118]:

5. 连接SQL数据库 - 图8