-
Notifications
You must be signed in to change notification settings - Fork 3
MySQL Histograms
Primal Pappachan edited this page Jun 24, 2020
·
1 revision
Set histogram sampling rate by setting the memory limit (100 MB in example)
Set histogram_generation_max_mem_size = 100000000;
Generate histogram for a column
analyze table `MALL_OBSERVATION` update histogram on shop_name with 50 buckets;
drop histogram
analyze table `MALL_OBSERVATION` drop histogram on shop_name;
To check all the histograms generated
SELECT TABLE_NAME, COLUMN_NAME, HISTOGRAM->>'$."number-of-buckets-specified"' AS num_buckets_specified, HISTOGRAM->>'$."histogram-type"' AS histogram_type, JSON_LENGTH(HISTOGRAM, '$.buckets') AS num_buckets_created FROM INFORMATION_SCHEMA.COLUMN_STATISTICS WHERE SCHEMA_NAME = "tinytest";
Convert equiheight histogram buckets for timestamp to SQL rows
SELECT lvalue, uvalue, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq, numItems FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(lvalue VARCHAR(60) PATH '$[0]', uvalue VARCHAR(60) PATH '$[1]', c double PATH '$[2]', numItems integer PATH '$[3]')) hist where column_name = 'timeStamp';
Convert equiheight histogram buckets for String to SQL rows
SELECT FROM_BASE64(SUBSTRING_INDEX(v1, ':', -1)) lvalue, FROM_BASE64(SUBSTRING_INDEX(v2, ':', -1)) uvalue, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq, numItems FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(v1 VARCHAR(60) PATH '$[0]', v2 VARCHAR(60) PATH '$[1]', c double PATH '$[2]', numItems integer PATH '$[3]')) hist where column_name = 'user_id';
Convert singleton histogram buckets for String to SQL rows
SELECT FROM_BASE64(SUBSTRING_INDEX(v, ':', -1)) value, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(v VARCHAR(60) PATH '$[0]', c double PATH '$[1]')) hist where column_name = 'user_interest';
Note
If with the above queries, value turn out to be null then remove BASE64 conversion.
SELECT v as value, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(v VARCHAR(60) PATH '$[0]', c double PATH '$[1]')) hist where column_name = 'user_interest';
Convert equiheight histogram buckets for Integer to SQL rows
SELECT lvalue, uvalue, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq, numItems FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(lvalue int PATH '$[0]', uvalue int PATH '$[1]', c double PATH '$[2]', numItems integer PATH '$[3]')) hist where column_name = 'user_id';
Convert singleton histogram buckets for Date to SQL rows
SELECT value, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq, numItems FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(value date PATH '$[0]', c double PATH '$[1]', numItems integer PATH '$[2]')) hist where column_name = 'start_date';
Convert equiheight histograms for Time to SQL rows
SELECT lvalue, uvalue, concat(round(c*100,1),'%') cumulfreq, CONCAT(round((c - LAG(c, 1, 0) over()) * 100,1), '%') freq, numItems FROM information_schema.column_statistics, JSON_TABLE(histogram->'$.buckets', '$[*]' COLUMNS(lvalue time PATH '$[0]', uvalue time PATH '$[1]', c double PATH '$[2]', numItems integer PATH '$[3]')) hist where column_name = 'start_time';
References