I have a query that help me list all index in a database and works pretty well:
x
SELECT TableName = t.name,
IndexName = ind.name,
IndexId = ind.index_id,
ColumnId = ic.index_column_id,
ColumnName = col.name,
--(SELECT SUM(CASE WHEN col.name IS NULL THEN 1 ELSE 0 END) * 100.0 / count(*) FROM t.name) as nulls_percent,
ind.*,
ic.*,
col.*
FROM sys.indexes ind
INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id
AND ind.index_id = ic.index_id
INNER JOIN sys.columns col ON ic.object_id = col.object_id
AND ic.column_id = col.column_id
INNER JOIN sys.tables t ON ind.object_id = t.object_id
WHERE ind.is_primary_key = 0
AND ind.is_unique = 0
AND ind.is_unique_constraint = 0
AND t.is_ms_shipped = 0
ORDER BY t.name,
ind.name,
ind.index_id,
ic.is_included_column,
ic.key_ordinal;
Unfortunately if I uncomment the 6th line the word t.name
is underlined in red and if I run the query I receive the error:
Invalid object name 'TableName'.
How can I make this subquery work?
The goal is to have a percentage of NULLs
on each column
Advertisement
Answer
Your question and query are quite interesting and can be resolved with the help of statistics which always exist for columns participating in a index as is the case of the columns your are selecting in your query.
For accurate results, it is beneficial to update the statistics before running the query I’m providing below.
;WITH cteColumnAllStats AS
(
SELECT
ST_COL.object_id,
ST_COL.column_id,
ST_COL.stats_id,
-- NOTE: order no among stats of the same column
ROW_NUMBER()
OVER(
PARTITION BY
ST_COL.object_id,
ST_COL.column_id
ORDER BY
ST_COL.stats_id
) AS StatsOrderNo
FROM sys.stats ST
INNER JOIN sys.stats_columns ST_COL
ON ST_COL.stats_id = ST.stats_id
AND ST_COL.object_id = ST.object_id
)
,cteColumnFirstStats AS
(
SELECT
ST_COL.object_id,
ST_COL.column_id,
-- NOTES:
-- =====
-- this would be null if there were no statistics for the column
-- however not in this case because we are only considering columns
-- participating in an index and all indices have statistics behind
-- the scenes.
--
-- Also consider whether the statistics have been updated:
-- If they have, the result will be a whole number (without decimals)
-- and the result is exact.
-- If they have not, the result is an estimate and in most of the cases
-- there will be decimals or even produce a negative result.
--
-- If you want accurate results, you need to update the statistics:
-- EXEC sp_updatestats
--
SUM(ST_HIST.range_rows) + SUM(ST_HIST.equal_rows) AS NonNullsRowCount
FROM cteColumnAllStats ST_COL
-- NOTE: this is the important bit
CROSS APPLY sys.dm_db_stats_histogram(
ST_COL.object_id,
ST_COL.stats_id
) ST_HIST
WHERE ST_COL.StatsOrderNo = 1 -- take only the first stats for the column
GROUP BY
ST_COL.object_id,
ST_COL.column_id
)
SELECT TableName = t.name,
IndexName = ind.name,
IndexId = ind.index_id,
ColumnId = ic.index_column_id,
ColumnName = col.name,
-- NOTE: included these columns for reference purposes (PLEASE REMOVE)
SIND.rowcnt AS [RowCount],
ST_COL.NonNullsRowCount,
SIND.rowcnt - ST_COL.NonNullsRowCount AS NullsRowCount,
--(SELECT SUM(CASE WHEN col.name IS NULL THEN 1 ELSE 0 END) * 100.0 / count(*) FROM t.name) as nulls_percent,
CASE
-- NOTE: stats are definitely out of date
WHEN SIND.rowcnt < ST_COL.NonNullsRowCount THEN NULL
-- NOTE: stats could be out of date (good to update them first)
-- Also we don't want a divide by 0 hence the NULLIF
ELSE (SIND.rowcnt - ST_COL.NonNullsRowCount) * 100.0 / NULLIF(SIND.rowcnt, 0)
END as nulls_percent,
ind.*,
ic.*,
col.*
FROM sys.indexes ind
INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id
AND ind.index_id = ic.index_id
INNER JOIN sys.columns col ON ic.object_id = col.object_id
AND ic.column_id = col.column_id
INNER JOIN sys.tables t ON ind.object_id = t.object_id
-- NOTE: this gives you the COUNT(*) without querying the table
INNER JOIN sys.sysindexes SIND
ON SIND.id = t.object_id
-- NOTE:
-- 0 means Heap
-- 1 means Clustered Index
-- Only these are reliable to use their rowcnt.
-- There's always 1 of these and not the other.
AND SIND.indid < 2
-- NOTE: inner join is OK here because all columns participating in a index
-- have associated statistics
INNER JOIN cteColumnFirstStats ST_COL
ON ST_COL.object_id = t.object_id
AND ST_COL.column_id = col.column_id
WHERE ind.is_primary_key = 0
AND ind.is_unique = 0
AND ind.is_unique_constraint = 0
AND t.is_ms_shipped = 0
ORDER BY t.name,
ind.name,
ind.index_id,
ic.is_included_column,
ic.key_ordinal;