Skip to content

Commit 1530b8e

Browse files
chore(isthmus): alternate forms for tpc-ds queries (#513)
Alternate forms for TPC-DS queries 27, 36, 70 and 86. These forms rewrite queries that use the GROUPING aggregate function, which does not have a direct Substrait equivalent. These test cases now pass. Signed-off-by: Mark S. Lewis <[email protected]>
1 parent 260a1c4 commit 1530b8e

File tree

5 files changed

+153
-2
lines changed

5 files changed

+153
-2
lines changed

isthmus/src/test/java/io/substrait/isthmus/TpcdsQueryTest.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313

1414
/** TPC-DS test to convert SQL to Substrait and then convert those plans back to SQL. */
1515
public class TpcdsQueryTest extends PlanTestBase {
16-
private static final Set<Integer> toSubstraitExclusions = Set.of(9, 27, 36, 70, 86);
16+
private static final Set<Integer> alternateForms = Set.of(27, 36, 70, 86);
17+
private static final Set<Integer> toSubstraitExclusions = Set.of(9);
1718
private static final Set<Integer> fromSubstraitPojoExclusions = Set.of(1, 30, 81);
1819
private static final Set<Integer> fromSubstraitProtoExclusions = Set.of(1, 30, 67, 81);
1920

@@ -28,7 +29,7 @@ static IntStream testCases() {
2829
@ParameterizedTest
2930
@MethodSource("testCases")
3031
public void testQuery(int query) throws IOException {
31-
String inputSql = asString(String.format("tpcds/queries/%02d.sql", query));
32+
String inputSql = asString(inputSqlFile(query));
3233

3334
Plan plan = assertDoesNotThrow(() -> toSubstraitPlan(inputSql), "SQL to Substrait POJO");
3435

@@ -48,6 +49,14 @@ public void testQuery(int query) throws IOException {
4849
}
4950
}
5051

52+
private String inputSqlFile(int query) {
53+
if (alternateForms.contains(query)) {
54+
return String.format("tpcds/queries/%02da.sql", query);
55+
}
56+
57+
return String.format("tpcds/queries/%02d.sql", query);
58+
}
59+
5160
private Plan toSubstraitPlan(String sql) throws SqlParseException {
5261
return toSubstraitPlan(sql, TPCDS_CATALOG);
5362
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
with results as
2+
(select i_item_id,
3+
s_state, 0 as g_state,
4+
ss_quantity agg1,
5+
ss_list_price agg2,
6+
ss_coupon_amt agg3,
7+
ss_sales_price agg4
8+
from store_sales, customer_demographics, date_dim, store, item
9+
where ss_sold_date_sk = d_date_sk and
10+
ss_item_sk = i_item_sk and
11+
ss_store_sk = s_store_sk and
12+
ss_cdemo_sk = cd_demo_sk and
13+
cd_gender = 'dist(gender, 1, 1)' and
14+
cd_marital_status = 'dist(marital_status, 1, 1)' and
15+
cd_education_status = 'dist(education, 1, 1)' and
16+
d_year = 1998 and
17+
s_state in ('TN', 'TN', 'TN', 'TN', 'TN', 'TN')
18+
)
19+
20+
select i_item_id, s_state, g_state, agg1, agg2, agg3, agg4
21+
from (
22+
select i_item_id, s_state, 0 as g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3, avg(agg4) agg4 from results
23+
group by i_item_id, s_state
24+
union all
25+
select i_item_id, NULL AS s_state, 1 AS g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3,
26+
avg(agg4) agg4 from results
27+
group by i_item_id
28+
union all
29+
select NULL AS i_item_id, NULL as s_state, 1 as g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3,
30+
avg(agg4) agg4 from results
31+
) foo
32+
order by i_item_id, s_state
33+
LIMIT 100
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
with results as
2+
(select
3+
sum(ss_net_profit) as ss_net_profit, sum(ss_ext_sales_price) as ss_ext_sales_price,
4+
sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin
5+
,i_category
6+
,i_class
7+
,0 as g_category, 0 as g_class
8+
from
9+
store_sales
10+
,date_dim d1
11+
,item
12+
,store
13+
where
14+
d1.d_year = 1998
15+
and d1.d_date_sk = ss_sold_date_sk
16+
and i_item_sk = ss_item_sk
17+
and s_store_sk = ss_store_sk
18+
and s_state in ('TN','TN','TN','TN', 'TN','TN','TN','TN')
19+
group by i_category,i_class)
20+
,
21+
results_rollup as
22+
(select gross_margin ,i_category ,i_class,0 as t_category, 0 as t_class, 0 as lochierarchy from results
23+
union
24+
select sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin,
25+
i_category, NULL AS i_class, 0 as t_category, 1 as t_class, 1 as lochierarchy from results group by i_category
26+
union
27+
select sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin,
28+
NULL AS i_category ,NULL AS i_class, 1 as t_category, 1 as t_class, 2 as lochierarchy from results)
29+
select
30+
gross_margin ,i_category ,i_class, lochierarchy,rank() over (
31+
partition by lochierarchy, case when t_class = 0 then i_category end
32+
order by gross_margin asc) as rank_within_parent
33+
from results_rollup
34+
order by
35+
lochierarchy desc
36+
,case when lochierarchy = 0 then i_category end
37+
,rank_within_parent
38+
LIMIT 100
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
with results as
2+
( select
3+
sum(ss_net_profit) as total_sum ,s_state ,s_county, 0 as gstate, 0 as g_county
4+
from
5+
store_sales
6+
,date_dim d1
7+
,store
8+
where
9+
d1.d_month_seq between 1220 and 1220 + 11
10+
and d1.d_date_sk = ss_sold_date_sk
11+
and s_store_sk = ss_store_sk
12+
and s_state in
13+
( select s_state
14+
from (select s_state as s_state,
15+
rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
16+
from store_sales, store, date_dim
17+
where d_month_seq between 1220 and 1220 + 11
18+
and d_date_sk = ss_sold_date_sk
19+
and s_store_sk = ss_store_sk
20+
group by s_state
21+
) tmp1
22+
where ranking <= 5)
23+
group by s_state,s_county) ,
24+
results_rollup as
25+
(select total_sum ,s_state ,s_county, 0 as g_state, 0 as g_county, 0 as lochierarchy from results
26+
union
27+
select sum(total_sum) as total_sum,s_state, NULL as s_county, 0 as g_state, 1 as g_county, 1 as lochierarchy from results group by s_state
28+
union
29+
select sum(total_sum) as total_sum ,NULL as s_state ,NULL as s_county, 1 as g_state, 1 as g_county, 2 as lochierarchy from results)
30+
select total_sum ,s_state ,s_county, lochierarchy
31+
,rank() over (
32+
partition by lochierarchy,
33+
case when g_county = 0 then s_state end
34+
order by total_sum desc) as rank_within_parent
35+
from results_rollup
36+
order by
37+
lochierarchy desc
38+
,case when lochierarchy = 0 then s_state end
39+
,rank_within_parent
40+
LIMIT 100
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
with results as
2+
( select sum(ws_net_paid) as total_sum, i_category, i_class, 0 as g_category, 0 as g_class
3+
from
4+
web_sales
5+
,date_dim d1
6+
,item
7+
where
8+
d1.d_month_seq between 1220 and 1220+11
9+
and d1.d_date_sk = ws_sold_date_sk
10+
and i_item_sk = ws_item_sk
11+
group by i_category,i_class
12+
) ,
13+
14+
results_rollup as
15+
( select total_sum ,i_category ,i_class, g_category, g_class, 0 as lochierarchy from results
16+
union
17+
select sum(total_sum) as total_sum, i_category, NULL as i_class, 0 as g_category, 1 as g_class, 1 as lochierarchy from results group by i_category
18+
union
19+
select sum(total_sum) as total_sum, NULL as i_category, NULL as i_class, 1 as g_category, 1 as g_class, 2 as lochierarchy from results)
20+
select total_sum ,i_category ,i_class, lochierarchy
21+
,rank() over (
22+
partition by lochierarchy,
23+
case when g_class = 0 then i_category end
24+
order by total_sum desc) as rank_within_parent
25+
from
26+
results_rollup
27+
order by
28+
lochierarchy desc,
29+
case when lochierarchy = 0 then i_category end,
30+
rank_within_parent
31+
LIMIT 100

0 commit comments

Comments
 (0)