Skip to content

Select with limit does not properly treat duplicates in columnshard #35914

@risenberg

Description

@risenberg

Set up the tables

CREATE TABLE test (str String NOT NULL, PRIMARY KEY(str)) WITH ( STORE = COLUMN, AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 9 );
CREATE TABLE test2 (str String NOT NULL, PRIMARY KEY(str)) WITH ( STORE = COLUMN, AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 9 );

ALTER OBJECT `/Root/testdb/test` (TYPE TABLE) SET (ACTION = UPSERT_OPTIONS, `COMPACTION_PLANNER.CLASS_NAME` = 'lc-buckets', `COMPACTION_PLANNER.FEATURES` = `{\"levels\":[{\"portions_live_duration\":\"30.000000s\",\"class_name\":\"Zero\",\"expected_blobs_size\":131072,\"portions_count_limit\":20000000},{\"class_name\":\"Zero\",\"expected_blobs_size\":131072,\"portions_count_limit\":20000000}]}`);

Fill the table with duplicate key values

import ydb
import random

# Fixed seed so every run of this reproduction generates the exact same
# sequence of random keys (making the bug deterministic to reproduce).
random.seed(100500)

def get_column_types():
    """Return the bulk-upsert schema: a single String column named ``str``."""
    schema = ydb.BulkUpsertColumns()
    schema.add_column("str", ydb.PrimitiveType.String)
    return schema

# Connect to the cluster and bulk-upsert batches containing heavily
# duplicated keys, so the column-shard accumulates overlapping portions
# with the same primary-key value.
with ydb.Driver(endpoint="<xxx>", database="<yyy>") as driver:
    driver.wait(timeout=30, fail_fast=True)

    # One very large value (300 KB) repeated in every batch; combined with
    # random short keys this produces many duplicate rows across portions.
    long_str = "abcdef" * 50000
    for i in range(1000):
        rows = []
        # Loop counter is unused — each iteration just appends one duplicate
        # long key and one random short key (200 rows per batch).
        for _ in range(100):
            rows.append({"str": bytes(long_str, "utf8")})
            rows.append({"str": bytes(str(random.randint(0, 1000000)), "utf8")})

        # Plain string literal: the original f-string had no placeholders
        # (ruff F541). "<yyy>" is a placeholder for the database path.
        driver.table_client.bulk_upsert(
            table_path="<yyy>/test",
            rows=rows,
            column_types=get_column_types(),
        )
        print(f"inserted {i}")
upsert into test2 select * from test limit 10000;
select count(*) from test2;
<actual result is less than 10000 — expected exactly 10000, since `test` contains far more than 10000 rows and the SELECT used LIMIT 10000>

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions