# @title Run this cell to download files and setup environment.
import duckdb
import pandas as pd
'display.max_rows', 1000)
pd.set_option(import numpy as np
import random
from tqdm.auto import tqdm
import os
import timeit
import ipywidgets as widgets
from IPython.display import display
#download parquet files for `transaction_tbl`
!wget https://github.com/cstorm125/cstorm125.github.io/releases/download/transaction_tbl/transaction_tbl.zip > /dev/null 2>&1
!unzip transaction_tbl.zip > /dev/null 2>&1
#create table in duckdb
= duckdb.connect()
con
#short-hand function to execute query with duckdb easily
def execute_query(query): return con.execute(query).fetchdf()
= f"""
query CREATE OR REPLACE TABLE transaction_tbl AS
SELECT *
FROM read_parquet('transaction_tbl/*/*/*.parquet', hive_partitioning=true);
"""
#load saved, partitioned table as transaction_tbl
execute_query(query)
(Almost) All SQL You Need Exercise: Personalized Recommendation
In this exercise, we will use a subset of the Taobao User Behavior (5M rows) to build and evaluate a rule-based recommendation system for users based on their past behaviors. We will โtrainโ based on data before 2017-12 and test on data of 2017-12, optimizing for top-10 hit rate based on purchases. We will ask you to write an appropriate query for each task required. Give it your best shot but reveal the answer keys if needed.
To complete the exercise, first run each cell individually (Run all
unfortunately does not work) then fill in the query and click Execute query
. Use Reveal hint
and Reveal answer
as needed. Refer to the blog post for SQL techniques you need to complete the exercise and duckdb documentation for the specific quirks of duckdb SQL.
Sanity Checks
After downloading and creating transaction_tbl
in our database, let us first do some sanity checks.
# @title Query to randomly see 100 rows
= f"""
answer_key select * from transaction_tbl limit 100;
"""
= 'select * and limit'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Count how many events, how many unique customers, how many unique items, how many unique event types.
= f"""
answer_key select
count(*) as nb_event
,count(distinct user_id) as nb_user
,count(distinct item_id) as nb_item
,count(distinct behavior_type) as nb_event_type
from transaction_tbl;
"""
= 'count and count distinct'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title How many events and unique users per event type?
= f"""
answer_key select
behavior_type
,count(*) as nb_event
,count(distinct user_id) as nb_user
from transaction_tbl
group by 1;
"""
= 'group by'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Convert integer timestamp into human-readable timestamp `event_timestamp` and date `event_date`, then save as `transaction_tbl_x` view.
= f"""
answer_key create or replace view transaction_tbl_x as (
select
a.*
,to_timestamp(a.timestamp) as event_timestamp
,strftime(to_timestamp(a.timestamp), '%Y-%m-%d') as event_date
,substring(cast(to_timestamp(a.timestamp)as varchar),1,7) as year_month --optional
from transaction_tbl a
);
select * from transaction_tbl_x limit 10;
"""
= 'to_timestamp and date_trunc'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Check distribution of `event_date`. Is there anything weird?
= f"""
answer_key select
event_date
,count(*) as nb_event
from transaction_tbl_x
group by 1
order by 1;
"""
= 'Do not forget we are using `transaction_tbl_x` now. Order by `event_date` to see if there is anything odd.'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
It is quite clear from exploration that most of the events come from 2017-11-25
to 2017-12-03
, so let us use 2017-11-25
to 2017-11-30
as training set and 2017-12-01
to 2017-12-03
as test set. If this were your production database, having events from the 1920s mixed in there should raise a flag but we will ignore them in this educational setting.
# @title Create `train_tbl` with events from `2017-11-25` to `2017-11-30` and `test_tbl` with events from `2017-12-01` to `2017-12-03`
= f"""
answer_key create or replace view train_tbl as (
select
*
from transaction_tbl_x
where event_date between '2017-11-25' and '2017-11-30'
);
create or replace view test_tbl as (
select
*
from transaction_tbl_x
where event_date between '2017-12-01' and '2017-12-03'
);
select 'train' split, count(*) nb_event, count(distinct user_id) nb_user from train_tbl
union all
select 'test' split, count(*) nb_event, count(distinct user_id) nb_user from test_tbl;
"""
= 'where-clause and `between`'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
Success Measurement: Metric and Baseline
Our task is to recommend 10 items that each user will most likely purchase during the test set period. The only information we have is what they viewed (pv
), added to cart (cart
) or favorited (fav
) during the training set period.
Metric
Our metric is top-10 hit rate based on purchases. This means that for each customer, we will recommend 10 items and if the user purchased any of those items during the test set period, the user is marked as 1 else as 0. We then take an average across all users to get our score.
Example: top-3 hit rate calculation
user_id | pred | is_purchased_in_test_tbl |
---|---|---|
A | X | 0 |
A | Y | 0 |
A | Z | 1 |
B | W | 0 |
B | T | 0 |
B | R | 0 |
C | Q | 0 |
C | T | 1 |
C | X | 1 |
user_id | hit | hit_flag |
---|---|---|
A | 1 | 1 |
B | 0 | 0 |
C | 2 | 1 |
top-3 hit rate = 1+0+1 / 3 = 66%
Baseline
The most simple baseline we can compare our recommendation system with is recommending top-10 best-selling items to all users
.
Let us try to measure the performance of this simple baseline.
# @title Create view `eval_tbl` containing `user_id`, `item_id` only for items the users in `test_tbl` purchased (`buy`)
= f"""
answer_key create or replace view eval_tbl as (
select
user_id
,item_id
from test_tbl
where behavior_type = 'buy'
group by 1,2
);
select * from eval_tbl;
"""
= 'group by and where'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Get top 10 most-interacted items in training set
= f"""
answer_key select
item_id
,count(*) as nb_event
from train_tbl
group by 1
order by 2 desc
limit 10;
"""
= 'group by and limit'
hint
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Create view `pred_top10_tbl` containing all `user_id` from `test_tbl`. Each `user_id` should have 10 rows each row with the 10 recommendations, in this case top-10 most interacted items (same for all `user_id`).
= f"""
answer_key create or replace view pred_top10_tbl as (
with test_user_tbl as (
select
user_id
,max(1) as joining_column
from test_tbl
group by 1),
top10_tbl as (
select
item_id
,count(*) as nb_buy
,max(1) as joining_column
from train_tbl
group by 1
order by 2 desc
limit 10)
select
a.user_id
,b.item_id
from test_user_tbl a
inner join top10_tbl b
on a.joining_column = b.joining_column
);
select count(*) nb_pred, count(distinct user_id) nb_user from pred_top10_tbl;
"""
= '''
hint You need to create a new `joining_column` that is all the same values max(1) to `inner join` on.
Or use `cross join`: https://www.geeksforgeeks.org/sql/sql-cross-join/
count(distinct user_id) and count(*) again to make sure that each user has 10 recommendations.
'''
#input
= widgets.Textarea(
text_area ="Write your query here. This is a tough one so you might want to use the hint.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title With `pred_top10_tbl` and `eval_tbl`, calculate top-10 hit rate
= f"""
answer_key select
avg(case when nb_hit > 0 then 1 else 0 end) as top_10_hit_rate
from
(select
a.user_id
,sum(case when b.item_id is not null then 1 else 0 end) as nb_hit
from eval_tbl a
left join pred_top10_tbl b
on a.user_id = b.user_id
and a.item_id = b.item_id
group by 1) c;
"""
= f"""
hint Remember that we average 1/0s across users, not items.
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
Considering we have 1,253,465 items in total, it is not surprising that our top-10 hit rate is only 0.0923% of all users in the test set. Let us see if we can do better.
Recommend what they have interacted with but have yet bought
One of the most powerful recommendation algorithm is to recommend what users have interacted with the most. In fact, this is what you would see in worldโs best online retail sites, especially those dealing with groceries, beauty, and health and personal care. We add a small twist by only ordering the top-10 recommendation for each customer by all interactions (pv
, cart
, fav
) EXCEPT buy
. This is because if a user has bought the item before, it might not be likely for them to buy again in such as short period of time between training set and test set.
# @title Create view `pred_mfp_tbl` containing all `user_id` from `train_tbl`. Each user should have 10 recommendations based on the top-10 items, ordered by the number of times they have interacted (`pv`, `cart`, `fav`) with, but have yet bought (`buy`).
= f"""
answer_key create or replace view pred_mfp_tbl as (
select * from
(select
user_id
,item_id
,row_number() over (partition by user_id order by nb_nonbuy_event desc) as rnk
from
(select
user_id
,item_id
,count(*) as nb_nonbuy_event
from train_tbl
where behavior_type <> 'buy'
group by 1,2) a
) b
where rnk <=10);
select
nb_pred
,count(*) as nb_user
from
(select
user_id
,count(*) as nb_pred
from pred_mfp_tbl
group by 1) a
group by 1 order by 2 desc;
"""
= f"""
hint We need window function to make sure we have at most 10 items per customer.
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Check if we have recommendation in `pred_mfp_tbl` for every user in `test_tbl`?
= f"""
answer_key select
avg(case when b.user_id is not null then 1 else 0 end) as has_pred
from (select distinct user_id from test_tbl) a
left join (select distinct user_id from pred_mfp_tbl) b
on a.user_id = b.user_id;
"""
= f"""
hint left join then count the nulls
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title For users who we do not have recommendations for in `pred_mfp_tbl`, give them recommendations from `pred_top10_tbl`. Create a new view for this called `pred_final_tbl`.
= f"""
answer_key create or replace view pred_top10_tbl_missing_only as (
select
a.user_id
,a.item_id
from pred_top10_tbl a
left join pred_mfp_tbl b
on a.user_id = b.user_id
where b.user_id is null);
create or replace view pred_final_tbl as (
select user_id,item_id from pred_mfp_tbl
union all
select user_id,item_id from pred_top10_tbl_missing_only);
select 'pred' split, count(*) nb_pred, count(distinct user_id) nb_user from pred_final_tbl
union all
select 'test' split, count(*) nb_pred, count(distinct user_id) nb_user from test_tbl;
"""
= f"""
hint One way you can do this is:
1) Find out who did not get recommendation in `pred_mfp_tbl`
2) Filter `pred_top10_tbl` to have only those users
3) Concatenate `pred_top10_tbl` containing the missing users and `pred_mfp_tbl` to create view `pred_final_tbl`. Make sure columns are the same when you concatenate.
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
# @title Calculate top-10 hit rate for `pred_final_tbl` and see if it is better than `pred_top10_tbl`
= f"""
answer_key select
'pred_top10_tbl' model
,avg(case when nb_hit > 0 then 1 else 0 end) as top_10_hit_rate
from
(select
a.user_id
,sum(case when b.item_id is not null then 1 else 0 end) as nb_hit
from eval_tbl a
left join pred_top10_tbl b
on a.user_id = b.user_id
and a.item_id = b.item_id
group by 1) c
union all
select
'pred_final_tbl' model
,avg(case when nb_hit > 0 then 1 else 0 end) as top_10_hit_rate
from
(select
a.user_id
,sum(case when b.item_id is not null then 1 else 0 end) as nb_hit
from eval_tbl a
left join pred_final_tbl b
on a.user_id = b.user_id
and a.item_id = b.item_id
group by 1) c;
"""
= f"""
hint Do the same thing as what you did for `pred_top10_tbl` then concatenate the result together.
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)
๐ Congratulations! You have made 21x improvement on the baseline to get almost 2% top-10 hit rate. This marks the end of this exercise but if you want to further improve our algorithm, feel free to do so with the console below.
# @title Run anything
= f"""
answer_key There is no correcrt answer in life.
"""
= f"""
hint You just need to mess around and find out.
"""
#input
= widgets.Textarea(
text_area ="Write your query here.",
value=10, # Initial number of visible rows
rows="Query:",
description={'width': '730px'} # Adjust width as needed
layout
)
#output
= query_result = widgets.Output(
query_result =widgets.Layout(
layout='1px solid lightgray', # Add a border to make the scrollable area visible
border='300px', # Fixed height for the output area
height='scroll' # Enable vertical scrolling
overflow_y
)
)
#button
= widgets.Button(
execute_button ='Execute query',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def process_query(b):
= text_area.value
query with query_result:
query_result.clear_output()
display(execute_query(query))
execute_button.on_click(process_query)
= widgets.Button(
answer_button ='Reveal answer key',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_answer_key(b):
= text_area.value
query with query_result:
query_result.clear_output()print(f'Answer key is: \n{answer_key}')
answer_button.on_click(show_answer_key)
= widgets.Button(
hint_button ='Reveal hint',
description=False,
disabled='', # 'success', 'info', 'warning', 'danger' or ''
button_style='check' # (FontAwesome icons available)
icon
)def show_hint(b):
with query_result:
query_result.clear_output()print(f'Hint: \n{hint}')
hint_button.on_click(show_hint)
display(text_area, execute_button, hint_button, answer_button, query_result)