5. 连接SQL数据库
# 在读取chinook数据库之前,需要创建SQLAlchemy引擎
In[108]: from sqlalchemy import create_engine
engine = create_engine('sqlite:///data/chinook.db')
In[109]: tracks = pd.read_sql_table('tracks', engine)
tracks.head()
Out[109]:
# read_sql_table函数可以读取一张表,第一个参数是表名,第二个参数是引擎
In[110]: genres = pd.read_sql_table('genres', engine)
genres.head()
Out[110]:
# 找到每种类型歌曲的平均时长
In[111]: genre_track = genres.merge(tracks[['GenreId', 'Milliseconds']],
on='GenreId', how='left') \
.drop('GenreId', axis='columns')
genre_track.head()
Out[111]:
# 将Milliseconds列转变为timedelta数据类型
In[112]: genre_time = genre_track.groupby('Name')['Milliseconds'].mean()
pd.to_timedelta(genre_time, unit='ms').dt.floor('s').sort_values()
Out[112]:
Name
Rock And Roll 00:02:14
Opera 00:02:54
Hip Hop/Rap 00:02:58
Easy Listening 00:03:09
Bossa Nova 00:03:39
R&B/Soul 00:03:40
World 00:03:44
Pop 00:03:49
Latin 00:03:52
Alternative & Punk 00:03:54
Soundtrack 00:04:04
Reggae 00:04:07
Alternative 00:04:24
Blues 00:04:30
Rock 00:04:43
Jazz 00:04:51
Classical 00:04:53
Heavy Metal 00:04:57
Electronica/Dance 00:05:02
Metal 00:05:09
Comedy 00:26:25
TV Shows 00:35:45
Drama 00:42:55
Science Fiction 00:43:45
Sci Fi & Fantasy 00:48:31
Name: Milliseconds, dtype: timedelta64[ns]
# 找到每名顾客花费的总时长
In[113]: cust = pd.read_sql_table('customers', engine,
columns=['CustomerId', 'FirstName', 'LastName'])
invoice = pd.read_sql_table('invoices', engine,
columns=['InvoiceId','CustomerId'])
ii = pd.read_sql_table('invoice_items', engine,
columns=['InvoiceId', 'UnitPrice', 'Quantity'])
In[114]: cust_inv = cust.merge(invoice, on='CustomerId') \
.merge(ii, on='InvoiceId')
cust_inv.head()
Out[114]:
# 现在可以用总量乘以单位价格,找到每名顾客的总消费
In[115]: total = cust_inv['Quantity'] * cust_inv['UnitPrice']
cols = ['CustomerId', 'FirstName', 'LastName']
cust_inv.assign(Total = total).groupby(cols)['Total'] \
.sum() \
.sort_values(ascending=False).head()
Out[115]:
更多
# sql语句查询方法read_sql_query
In[116]: pd.read_sql_query('select * from tracks limit 5', engine)
Out[116]:
# 可以将长字符串传给read_sql_query
In[117]: sql_string1 = '''
select
Name,
time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
from (
select
g.Name,
t.Milliseconds
from
genres as g
join
tracks as t
on
g.genreid == t.genreid
)
group by
Name
order by
avg_time
'''
pd.read_sql_query(sql_string1, engine)
Out[117]:
In[118]: sql_string2 = '''
select
c.customerid,
c.FirstName,
c.LastName,
sum(ii.quantity * ii.unitprice) as Total
from
customers as c
join
invoices as i
on c.customerid = i.customerid
join
invoice_items as ii
on i.invoiceid = ii.invoiceid
group by
c.customerid, c.FirstName, c.LastName
order by
Total desc
'''
pd.read_sql_query(sql_string2, engine)
Out[118]: