Microsoft 365: Duplicate files
Updated:Summary
This report scans for truly duplicated files in Microsoft 365. The content hash collected from each file allows duplicates to be found and remediated.
Details
Duplicated files can be redundant — taking up extra space — or they can be a security risk. Files downloaded from SharePoint, where access is more tightly regulated, to a user’s OneDrive for Business and then re-shared can become a significant exposure.
This report finds duplicated files even if they have been renamed. They may be spread across SharePoint document libraries, Teams drives, or OneDrive drives.
To limit which drive categories the report covers (SharePoint, Teams, or OneDrive), edit the WHERE clause on line 65.
Code
SQL Server
WITH paths AS (
SELECT
di.name,
di.size,
di.created_by,
di.modified_by,
COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
di.item_type,
RIGHT(pp.web_url, LEN(pp.web_url) - LEN(d.web_url)) AS parent_path,
d.web_url AS drive_path,
srs.bytes_to_hex_string(di.file_hash) as file_hash,
CASE
WHEN udm.id IS NOT NULL THEN 'OneDrive'
WHEN gdm.id IS NOT NULL THEN 'Teams'
ELSE 'SharePoint'
END AS drive_category
FROM ms365.drive_items AS di
JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
WHERE 1=1
AND ds.scan_state = 1
AND di.item_type = 1
),
p AS (
SELECT
p.drive_path,
CASE
WHEN LEN(p.parent_path) = 0 THEN '/'
ELSE p.parent_path
END AS parent_path,
p.name AS filename,
p.size as item_size,
SUM(p.size) over (partition by p.file_hash) as total_hash_size,
srs.byte_string(p.size) AS total_item_size_string,
srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
(SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
p.file_hash,
p.total_hash_count,
p.total_hash_count - 1 AS wasted_hash_count,
p.drive_category
FROM
paths AS p
LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
WHERE 1=1
AND p.file_hash IS NOT NULL
AND p.size > 0
AND p.total_hash_count >= 2
),
x AS (
SELECT
p.*,
SUM(p.item_size) OVER() AS total_size,
DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
FROM p
WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
SELECT
SUM(p.wasted_size) AS total_wasted_size
FROM p
WHERE p.hash_rownum = 1
)
SELECT
x.*,
y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC
PostgreSQL
WITH paths AS (
SELECT
di.name,
di.size,
di.created_by,
di.modified_by,
COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
di.item_type,
RIGHT(pp.web_url, length(pp.web_url) - length(d.web_url)) AS parent_path,
d.web_url AS drive_path,
srs.bytes_to_hex_string(di.file_hash) as file_hash,
CASE
WHEN udm.id IS NOT NULL THEN 'OneDrive'
WHEN gdm.id IS NOT NULL THEN 'Teams'
ELSE 'SharePoint'
END AS drive_category
FROM ms365.drive_items AS di
JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
WHERE 1=1
AND ds.scan_state = 1
AND di.item_type = 1
),
p AS (
SELECT
p.drive_path,
CASE
WHEN length(p.parent_path) = 0 THEN '/'
ELSE p.parent_path
END AS parent_path,
p.name AS filename,
p.size as item_size,
SUM(p.size) over (partition by p.file_hash) as total_hash_size,
srs.byte_string(p.size) AS total_item_size_string,
srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
(SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
p.file_hash,
p.total_hash_count,
p.total_hash_count - 1 AS wasted_hash_count,
p.drive_category
FROM
paths AS p
LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
WHERE 1=1
AND p.file_hash IS NOT NULL
AND p.size > 0
AND p.total_hash_count >= 2
),
x AS (
SELECT
p.*,
SUM(p.item_size) OVER() AS total_size,
DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
FROM p
WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
SELECT
SUM(p.wasted_size) AS total_wasted_size
FROM p
WHERE p.hash_rownum = 1
)
SELECT
x.*,
y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC Downloads
| Attachment | Size |
|---|---|
| MS365-DuplicateFiles.zip | 6.48 KB |
Sample Report
| Attachment | Size |
|---|---|
| MS365-DuplicateFiles.pdf | 266.07 KB |