Skip to content

Commit 6d5e2c7

Browse files
authored
Merge pull request #942 from reshmee011/crawl_search
new sample script comparing crawl versus search results
2 parents 0c6e98c + c8be14f commit 6d5e2c7

4 files changed

Lines changed: 296 additions & 0 deletions

File tree

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
# Find Files which are not searchable
2+
3+
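This script compares the SharePoint crawl log with search results for each document library and reports files that are missing from the crawl log, from the search results, or from both. It reads the crawl log to determine when each file was last indexed and checks whether the file is actually searchable. The underlying cause of a non-searchable file can vary; if reindexing the site or library does not fix the issue, you may need to raise a support case with Microsoft for deeper investigation.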
4+
5+
## Summary
6+
7+
# [PnP PowerShell](#tab/pnpps)
8+
9+
```powershell
10+
#Requires -Modules PnP.PowerShell
Clear-Host

# ===== Settings =====
# Client ID passed to Connect-PnPOnline -ClientId; replace the placeholder before running.
$clientId = "xxxxxx"
# Timestamp embedded in the output CSV file name.
$dateTime = Get-Date -Format "yyyy-MM-dd-HH-mm-ss"
# Prefixed to server-relative URLs to build absolute library and file URLs.
$tenantUrl = "https://contoso.sharepoint.com"

# File extensions to exclude (case-insensitive) as they can't be searched using their Path metadata, e.g. Path:FileUrl
$ExcludedExtensions = @('.png', '.jpg', '.jpeg', '.xltx', '.one', '.onetoc2', '.gif','.mp4','.agent')

# Folder containing this script. $PSScriptRoot is empty when the code is not run from a
# saved script file (for example when pasted into a console), so fall back to the current
# location instead of failing on a null path.
$directoryPath = $PSScriptRoot
if ([string]::IsNullOrWhiteSpace($directoryPath)) { $directoryPath = (Get-Location).Path }
$csvPath = Join-Path $directoryPath "sites1.csv" # CSV must have a column 'SiteUrl' containing a list of site urls

# Ensure output folder exists
$outputFolder = Join-Path $directoryPath "output_files"
if (-not (Test-Path $outputFolder)) { New-Item -ItemType Directory -Path $outputFolder | Out-Null }
$outputCsv = Join-Path $outputFolder ("NonSearchableIndexable-" + $dateTime + ".csv")

# Lists/libraries to exclude (matched against the library Title)
$ExcludedLists = @(
    "Access Requests","App Packages","appdata","appfiles","Apps in Testing","Cache Profiles","Composed Looks",
    "Content and Structure Reports","Content type publishing error log","Converted Forms","Device Channels",
    "Form Templates","fpdatasources","Get started with Apps for Office and SharePoint","List Template Gallery",
    "Long Running Operation Status","Maintenance Log Library","Images","site collection images","Master Docs",
    "Master Page Gallery","MicroFeed","NintexFormXml","Quick Deploy Items","Relationships List","Reusable Content",
    "Reporting Metadata","Reporting Templates","Search Config List","Site Assets","Preservation Hold Library",
    "Site Pages","Solution Gallery","Style Library","Suggested Content Browser Locations","Theme Gallery",
    "TaxonomyHiddenList","User Information List","Web Part Gallery","wfpub","wfsvc","Workflow History",
    "Workflow Tasks","Pages"
)

# ===== Safety checks =====
# Stop early when the input CSV is missing; nothing below can run without it.
if (-not (Test-Path $csvPath)) {
    Write-Error "CSV not found at $csvPath. Ensure it exists and includes a 'SiteUrl' column."
    exit 1
}
49+
50+
# ===== Helpers =====
51+
function Normalize-Url {
    <#
    .SYNOPSIS
        Produces a canonical form of a URL for case-insensitive comparisons.
    .PARAMETER Url
        The URL text to normalise.
    .OUTPUTS
        The URL with surrounding whitespace and trailing '/' characters removed and
        converted to lower case, or $null when the input is null, empty or whitespace.
    #>
    param([string]$Url)

    if (-not [string]::IsNullOrWhiteSpace($Url)) {
        $withoutTrailingSlash = $Url.Trim().TrimEnd('/')
        return $withoutTrailingSlash.ToLowerInvariant()
    }

    return $null
}
56+
function Get-UrlVariants {
    <#
    .SYNOPSIS
        Returns the normalised spellings of a URL that are used when matching it
        against other URL collections.
    .PARAMETER Url
        The URL to expand.
    .OUTPUTS
        The distinct, non-empty results of Normalize-Url applied to: the trimmed URL,
        the URL with spaces replaced by '%20', and the URL with '%20' replaced by spaces.
        Nothing is emitted when the input is null, empty or whitespace.
    #>
    param([string]$Url)

    if ([string]::IsNullOrWhiteSpace($Url)) { return @() }

    $trimmed = $Url.Trim()

    # Same three candidates, in the same order: as-is, spaces encoded, spaces decoded.
    $candidates = @(
        (Normalize-Url $trimmed),
        (Normalize-Url ($trimmed -replace ' ', '%20')),
        (Normalize-Url ($trimmed -replace '%20', ' '))
    )

    # Drop empty entries and duplicates before emitting to the pipeline.
    $candidates | Where-Object { $_ } | Select-Object -Unique
}
67+
68+
# ===== Collect results =====
# For every site listed in the CSV, each visible document library is inspected and one
# row is recorded per file that is missing from the crawl log, the search results, or both.

# Returns $true when the extension of the given path/URL is listed in $ExcludedExtensions.
function Test-ExcludedExtension {
    param([string]$PathOrUrl)
    $ext = [System.IO.Path]::GetExtension($PathOrUrl)
    return [bool]($ext -and ($ExcludedExtensions -contains $ext.ToLower()))
}

$results = New-Object System.Collections.Generic.List[object]
$sites = Import-Csv -Path $csvPath # expects column "SiteUrl"

foreach ($s in $sites) {
    $siteUrl = $s.SiteUrl
    if ([string]::IsNullOrWhiteSpace($siteUrl)) { continue }

    Write-Host "Connecting to site: $siteUrl" -ForegroundColor Cyan

    try {
        # Connect interactively with the client ID
        Connect-PnPOnline -ClientId $clientId -Url $siteUrl -Interactive

        # Get only visible document libraries (exclude hidden/system libraries)
        $libraries = Get-PnPList -Includes BaseType, BaseTemplate, Hidden, Title, ItemCount, RootFolder |
            Where-Object {
                $_.Hidden -eq $false -and
                $_.BaseType -eq "DocumentLibrary" -and
                $_.Title -notin $ExcludedLists
            }

        foreach ($library in $libraries) {
            $libraryAbsUrl = ($tenantUrl.TrimEnd('/')) + $library.RootFolder.ServerRelativeUrl
            Write-Host " Library: $($library.Title)" -ForegroundColor Yellow

            # Pull only fields we need and page for large lists
            $listItemErrors = $null
            $listItems = Get-PnPListItem -List $library -PageSize 500 `
                -Fields "FileRef","FSObjType" `
                -ErrorAction SilentlyContinue `
                -ErrorVariable listItemErrors
            if ($listItemErrors) {
                # Previously suppressed without any signal; the library would just yield no rows.
                Write-Warning "Could not read items from $libraryAbsUrl : $($listItemErrors[0])"
            }

            # Nothing to evaluate, so avoid the search and crawl-log round trips entirely.
            if (-not $listItems) { continue }

            # ==== SEARCH RESULTS (library scope) ====
            $kql = "Path:`"$libraryAbsUrl`""
            $searchresults = $null
            try {
                # -ErrorAction Stop makes failures reach the catch block. Without it a failed
                # query is invisible and every file below is reported as "Searchable = No".
                $searchresults = Submit-PnPSearchQuery `
                    -Query $kql `
                    -All `
                    -SelectProperties @("Title","Path","LastModifiedTime") `
                    -SortList @{ "LastModifiedTime" = "Descending" } `
                    -ErrorAction Stop
            } catch {
                Write-Warning "Search query failed for $libraryAbsUrl : $($_.Exception.Message)"
            }

            # Build a fast lookup of (normalized) paths from search results
            $searchPathSet = New-Object 'System.Collections.Generic.HashSet[string]'
            if ($searchresults -and $searchresults.ResultRows) {
                foreach ($row in $searchresults.ResultRows) {
                    # Rows are read either as dictionaries or as objects exposing a Path property.
                    $p = $null
                    if ($row -is [System.Collections.IDictionary]) { $p = [string]$row["Path"] }
                    elseif ($row.PSObject.Properties.Match("Path")) { $p = [string]$row.Path }

                    if (-not $p) { continue }
                    # Skip excluded extensions to keep the set cleaner
                    if (Test-ExcludedExtension $p) { continue }
                    $null = $searchPathSet.Add((Normalize-Url $p))
                }
            }

            # ==== CRAWL LOG (library scope) ====
            $crawlMap = @{} # url (normalized) -> [DateTime] max last indexed time (or $null if unparseable)
            try {
                $crawlresults = Get-PnPSearchCrawlLog -Filter $libraryAbsUrl `
                    -RowLimit (($library.ItemCount * 2) + 10) `
                    -ErrorAction Stop

                foreach ($cr in $crawlresults) {
                    $urlVal = $cr.Url
                    if (-not $urlVal) { continue }
                    if (Test-ExcludedExtension $urlVal) { continue }

                    $nUrl = Normalize-Url $urlVal
                    if (-not $nUrl) { continue }

                    # -as yields $null instead of throwing when CrawlTime cannot be converted.
                    $lastIdx = $cr.CrawlTime -as [datetime]

                    if (-not $crawlMap.ContainsKey($nUrl)) {
                        $crawlMap[$nUrl] = $lastIdx
                    } elseif ($lastIdx -and (-not $crawlMap[$nUrl] -or $lastIdx -gt $crawlMap[$nUrl])) {
                        # Keep the most recent crawl time seen for this URL.
                        $crawlMap[$nUrl] = $lastIdx
                    }
                }
            } catch {
                # Warning rather than verbose: otherwise a failed crawl-log query silently
                # turns into "Indexed = No" for every file in the library.
                Write-Warning "Crawl log query failed for $libraryAbsUrl : $($_.Exception.Message)"
            }

            # ==== Evaluate each file ====
            foreach ($item in $listItems) {
                # FSObjType: 0=file, 1=folder
                if ($item.FieldValues["FSObjType"] -ne 0) { continue }

                $serverRelative = $item.FieldValues["FileRef"]
                if ([string]::IsNullOrWhiteSpace($serverRelative)) { continue }

                # Skip unwanted extensions up front
                if (Test-ExcludedExtension $serverRelative) { continue }

                $fullUrl = ($tenantUrl.TrimEnd('/')) + $serverRelative
                $urlVariants = Get-UrlVariants -Url $fullUrl

                # SEARCHABLE? (if any variant appears in search results)
                $searchable = "No"
                foreach ($v in $urlVariants) {
                    if ($searchPathSet.Contains($v)) { $searchable = "Yes"; break }
                }

                # INDEXED? (if any variant appears in crawl log map)
                $indexed = "No"
                $lastIndexedTime = $null
                foreach ($v in $urlVariants) {
                    if ($crawlMap.ContainsKey($v)) {
                        $indexed = "Yes"
                        $lastIndexedTime = $crawlMap[$v]
                        break
                    }
                }

                # Only files failing at least one of the two checks are reported.
                if ($indexed -ne "Yes" -or $searchable -ne "Yes") {
                    $results.Add([pscustomobject]@{
                        SiteUrl                = $siteUrl
                        LibraryTitle           = $library.Title
                        LibraryUrl             = $libraryAbsUrl
                        FileServerRelativePath = $serverRelative
                        FullUrl                = $fullUrl
                        Indexed                = $indexed
                        LastIndexedTime        = $lastIndexedTime
                        Searchable             = $searchable
                    })
                }
            }
        }
    }
    catch {
        # A failure on one site must not stop the remaining sites from being processed.
        Write-Warning "Failed on site $siteUrl. Error: $($_.Exception.Message)"
        continue
    }
}

# ===== Export =====
$results | Export-Csv -Path $outputCsv -NoTypeInformation -Encoding UTF8
Write-Host "Export complete: $outputCsv" -ForegroundColor Green
220+
221+
```
222+
223+
[!INCLUDE [More about PnP PowerShell](../../docfx/includes/MORE-PNPPS.md)]
224+
225+
226+
## Source Credit
227+
228+
Sample idea first appeared on [Get-PnPSearchCrawlLog. Search as alternatives with gotchas. Not a replacement for crawl log](https://reshmeeauckloo.com/posts/powershell-sharepoint-files-indexable-searchable/).
229+
230+
## Contributors
231+
232+
| Author(s) |
233+
|-----------|
234+
| [Reshmee Auckloo](https://github.com/reshmee011) |
235+
236+
237+
[!INCLUDE [DISCLAIMER](../../docfx/includes/DISCLAIMER.md)]
238+
<img src="https://m365-visitor-stats.azurewebsites.net/script-samples/scripts/spo-crawllog-indexed-searchable" aria-hidden="true" />
83.2 KB
Loading
58.7 KB
Loading
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
[
2+
{
3+
"name": "spo-crawllog-indexed-searchable",
4+
"source": "pnp",
5+
"title": "SharePoint Search versus crawl log",
6+
"shortDescription": "SharePoint Search versus crawl log",
7+
"url": "https://pnp.github.io/script-samples/spo-crawllog-indexed-searchable/README.html",
8+
"longDescription": [
9+
"SharePoint Search versus crawl log for files not searchable"
10+
],
11+
"creationDateTime": "2026-04-10",
12+
"updateDateTime": "2026-04-10",
13+
"products": [
14+
"SharePoint",
15+
"Search",
16+
"Crawl Log"
17+
],
18+
"metadata": [
19+
{
20+
"key": "PNP-POWERSHELL",
21+
"value": "3.1.0"
22+
}
23+
],
24+
"categories": [
25+
"Search",
26+
"Crawl Log"
27+
],
28+
"tags": [
29+
"Connect-PnPOnline",
30+
"Get-PnPList",
31+
"Get-PnPSearchCrawlLog",
32+
"Submit-PnPSearchQuery"
33+
],
34+
"thumbnails": [
35+
{
36+
"type": "image",
37+
"order": 100,
38+
"url": "https://raw.githubusercontent.com/pnp/script-samples/main/scripts/spo-crawllog-indexed-searchable/assets/example.png",
39+
"alt": "Preview of the issue"
40+
}
41+
],
42+
"authors": [
43+
{
44+
"gitHubAccount": "reshmee011",
45+
"company": "",
46+
"pictureUrl": "https://github.com/reshmee011.png",
47+
"name": "Reshmee Auckloo"
48+
}
49+
],
50+
"references": [
51+
{
52+
"name": "Want to learn more about PnP PowerShell and the cmdlets",
53+
"description": "Check out the PnP PowerShell site to get started and for the reference to the cmdlets.",
54+
"url": "https://aka.ms/pnp/powershell"
55+
}
56+
]
57+
}
58+
]

0 commit comments

Comments
 (0)