diff --git a/documentation/Get-PnPUnsuccesfulCrawledUrls.md b/documentation/Get-PnPUnsuccesfulCrawledUrls.md
new file mode 100644
index 000000000..26cb0afc5
--- /dev/null
+++ b/documentation/Get-PnPUnsuccesfulCrawledUrls.md
@@ -0,0 +1,173 @@
+---
+Module Name: PnP.PowerShell
+schema: 2.0.0
+applicable: SharePoint Online
+online version: https://pnp.github.io/powershell/cmdlets/Get-PnPUnsuccesfulCrawledUrls.html
+external help file: PnP.PowerShell.dll-Help.xml
+title: Get-PnPUnsuccesfulCrawledUrls
+---
+
+# Get-PnPUnsuccesfulCrawledUrls
+
+## SYNOPSIS
+
+Retrieves a list of URLs that failed to be indexed during a search crawl, which is useful for diagnosing search issues.
+
+> Make sure you have been granted access to the crawl log via the SharePoint search admin center at https://<tenant>-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx before running this cmdlet.
+
+## SYNTAX
+
+```powershell
+Get-PnPUnsuccesfulCrawledUrls [-Filter <String>] [-StartDate <DateTime>] [-EndDate <DateTime>] [-RawFormat]
+ [-IncreaseRequestTimeout] [-Connection <PnPConnection>]
+```
+
+## DESCRIPTION
+
+Enables retrieval of items that failed to be indexed during a search crawl. This is particularly useful when processing large lists or libraries, where broader crawl log queries run into request timeouts. Because the query focuses exclusively on warnings and errors, you can reliably identify issues without extra effort to narrow the query scope.
+
+## EXAMPLES
+
+### EXAMPLE 1
+```powershell
+Get-PnPUnsuccesfulCrawledUrls
+```
+
+Returns all crawl log warnings and errors for site content. The number of returned entries is limited by the request timeout.
+
+### EXAMPLE 2
+```powershell
+Get-PnPUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet"
+```
+
+Returns crawl log warnings and errors for the specified site. The number of returned entries is limited by the request timeout.
+
+### EXAMPLE 3
+```powershell
+Get-PnPUnsuccesfulCrawledUrls -StartDate (Get-Date).AddDays(-10)
+```
+
+Returns crawl log warnings and errors, starting from 10 days ago.
+
+> Based on the author's test results and Copilot's input 😉, the `DocumentCrawlLog` methods don't respect the __time__ component of `StartDate` and `EndDate`; they only __use the date__ portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. The CSOM API (`GetCrawledUrls`) accepts `DateTime` values, but the backend partitions data by date, not by timestamp.
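+Given that date-only behavior, a minimal sketch that makes the day boundaries explicit (so the ignored time component cannot mislead you):
+
+```powershell
+# The crawl log partitions entries by day, so pass whole days explicitly.
+$start = (Get-Date).AddDays(-10).Date   # midnight, 10 days ago
+$end   = (Get-Date).Date.AddDays(1)     # up to and including all of today
+Get-PnPUnsuccesfulCrawledUrls -StartDate $start -EndDate $end
+```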
+
+### EXAMPLE 4
+```powershell
+$ClientID = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+$env:SharePointPnPHttpTimeout = -1 # 👈
+
+Connect-PnPOnline -Url https://<tenant>-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop # 👈
+
+Get-PnPUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" -IncreaseRequestTimeout
+```
+
+Increases the request timeout, allowing the call to last up to 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit, so even at this maximum a large query may still exceed the threshold.
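+
+The default (non-raw) output is a typed entry exposing `Url`, `CrawlTime`, `ItemTime`, `Status`, `ErrorCode`, `ItemId`, `LastTouchedTime` and `DatabaseName` properties, so standard PowerShell pipelines apply. As a sketch (the site URL is a placeholder), you could summarize failures by error description and keep a copy for offline analysis:
+
+```powershell
+$entries = Get-PnPUnsuccesfulCrawledUrls -Filter "https://contoso.sharepoint.com/sites/Intranet"
+
+# Most frequent error descriptions first.
+$entries | Group-Object Status | Sort-Object Count -Descending | Select-Object Count, Name
+
+# Export the full list for further analysis.
+$entries | Export-Csv -Path .\CrawlErrors.csv -NoTypeInformation
+```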
+
+## PARAMETERS
+
+### -Connection
+Optional connection to be used by the cmdlet. Retrieve the value for this parameter by either specifying -ReturnConnection on Connect-PnPOnline or by executing Get-PnPConnection.
+
+```yaml
+Type: PnPConnection
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
+
+### -EndDate
+End date to stop getting entries at. Defaults to the current time.
+
+```yaml
+Type: DateTime
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
+
+### -Filter
+Filter to limit what is being returned. Has to be a URL prefix for SharePoint content. Wildcard characters are not supported.
+
+```yaml
+Type: String
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
+
+### -IncreaseRequestTimeout
+Increases the request timeout for this command to its maximum of 3 minutes to accommodate large result sets. The `ClientRuntimeContext` enforces a three-minute limit, so even at this maximum the threshold may still be exceeded.
+
+> Note: Before running Get-PnPUnsuccesfulCrawledUrls with -IncreaseRequestTimeout, you must set $env:SharePointPnPHttpTimeout = -1 to remove the default HttpClient timeout, and then establish a new PnP connection, because the environment variable is only applied when the session initializes.
+
+```powershell
+$ClientID = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+$env:SharePointPnPHttpTimeout = -1
+
+Connect-PnPOnline -Url https://<tenant>-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop
+
+$scope = "https://contoso-my.sharepoint.com/sites/Intranet"
+Get-PnPUnsuccesfulCrawledUrls -Filter $scope -IncreaseRequestTimeout
+```
+
+```yaml
+Type: SwitchParameter
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
+
+### -RawFormat
+Show raw crawl log data instead of the mapped entries.
+
+```yaml
+Type: SwitchParameter
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
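+
+For instance, to explore which raw columns the crawl log returns (the exact column set depends on the search backend, so treat this as exploratory):
+
+```powershell
+Get-PnPUnsuccesfulCrawledUrls -RawFormat | Select-Object -First 1 | Format-List *
+```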
+
+### -StartDate
+Start date to start getting entries from. Defaults to the beginning of time.
+
+```yaml
+Type: DateTime
+Parameter Sets: (All)
+
+Required: False
+Position: Named
+Default value: None
+Accept pipeline input: False
+Accept wildcard characters: False
+```
+
+## RELATED LINKS
+
+[Microsoft 365 Patterns and Practices](https://aka.ms/m365pnp)
+
diff --git a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs
new file mode 100644
index 000000000..53336a3f7
--- /dev/null
+++ b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs
@@ -0,0 +1,183 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Management.Automation;
+using Microsoft.SharePoint.Client;
+using Microsoft.SharePoint.Client.Search.Administration;
+using PnP.PowerShell.Commands.Attributes;
+
+namespace PnP.PowerShell.Commands.Search
+{
+    public class UnsuccesfullCrawlEntry
+    {
+        public string Url { get; set; }
+        public DateTime CrawlTime { get; set; }
+        public DateTime ItemTime { get; set; }
+        public string Status { get; set; }
+        public int ErrorCode { get; set; }
+        public int ItemId { get; set; }
+        public DateTime LastTouchedTime { get; set; }
+        public string DatabaseName { get; set; }
+    }
+
+    [Cmdlet(VerbsCommon.Get, "PnPUnsuccesfulCrawledUrls")]
+    [ApiNotAvailableUnderApplicationPermissions]
+    public class GetUnsuccesfulCrawledUrls : PnPWebCmdlet
+    {
+        [Parameter(Mandatory = false)]
+        public string Filter;
+
+        [Parameter(Mandatory = false)]
+        // Only the date portion is honored by the crawl log backend.
+        public DateTime StartDate = DateTime.MinValue;
+
+        [Parameter(Mandatory = false)]
+        // Only the date portion is honored by the crawl log backend.
+        public DateTime EndDate = DateTime.UtcNow.AddDays(1);
+
+        [Parameter(Mandatory = false)]
+        public SwitchParameter RawFormat;
+
+        [Parameter(Mandatory = false)]
+        public SwitchParameter IncreaseRequestTimeout;
+
+        private const int MaxRows = 100000;
+
+        protected override void ExecuteCmdlet()
+        {
+            try
+            {
+                if (IncreaseRequestTimeout)
+                {
+                    string timeoutValue = Environment.GetEnvironmentVariable("SharePointPnPHttpTimeout");
+                    if (string.IsNullOrEmpty(timeoutValue))
+                    {
+                        LogWarning("The timeout can only be increased if the SharePointPnPHttpTimeout environment variable is set to 180000 or -1.");
+                        LogWarning("Run $env:SharePointPnPHttpTimeout = -1 and then establish a new connection with Connect-PnPOnline.");
+                        return;
+                    }
+                    else
+                    {
+                        // Maximum of 3 minutes, because the default CSOM timeout is 180,000 ms.
+                        ClientContext.RequestTimeout = 3 * 60 * 1000;
+                    }
+                }
+                var crawlLog = new DocumentCrawlLog(ClientContext, ClientContext.Site);
+                ClientContext.Load(crawlLog);
+
+                string postFilter = string.Empty;
+                if (string.IsNullOrWhiteSpace(Filter))
+                {
+                    Filter = $"https://{GetHostName()}.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}";
+                }
+
+                var logEntries = crawlLog.GetUnsuccesfulCrawledUrls(Filter, StartDate, EndDate);
+                ClientContext.ExecuteQueryRetry();
+
+                if (RawFormat)
+                {
+                    var entries = new List<object>();
+                    foreach (var dictionary in logEntries.Value.Rows)
+                    {
+                        string url = System.Net.WebUtility.UrlDecode(dictionary["FullUrl"].ToString());
+                        if (string.IsNullOrWhiteSpace(postFilter) || url.Contains(postFilter))
+                        {
+                            entries.Add(ConvertToPSObject(dictionary));
+                        }
+                    }
+                    // Emit the raw rows as PSObjects.
+                    WriteObject(entries, true);
+                }
+                else
+                {
+                    var entries = new List<UnsuccesfullCrawlEntry>(logEntries.Value.Rows.Count);
+                    foreach (var dictionary in logEntries.Value.Rows)
+                    {
+                        var entry = MapCrawlLogEntry(dictionary);
+                        if (string.IsNullOrWhiteSpace(postFilter) || entry.Url.Contains(postFilter))
+                        {
+                            entries.Add(entry);
+                        }
+                    }
+
+                    WriteObject(entries.OrderByDescending(i => i.CrawlTime).ToList(), true);
+                }
+            }
+            catch (Exception e)
+            {
+                if (e.Message == "The operation has timed out.")
+                {
+                    LogError($"Error: {e.Message}. Default CSOM timeout is 180,000 ms (≈3 minutes). If you are querying large crawl logs or broad date ranges, the server may take longer than that.");
+                }
+                else
+                {
+                    LogError($"Error: {e.Message}. Make sure you are granted access to the crawl log via the SharePoint search admin center at https://<tenant>-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx");
+                }
+            }
+        }
+
+#region Helper functions
+
+        private string GetHostName()
+        {
+            return new Uri(ClientContext.Url).Host.Replace("-admin", "").Replace("-public", "").Replace("-my", "").Replace($".sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}", "");
+        }
+
+        private int GetContentSourceIdForSites(DocumentCrawlLog crawlLog)
+        {
+            var hostName = GetHostName();
+            var spContent = crawlLog.GetCrawledUrls(false, 10, $"https://{hostName}.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}/sites", true, -1, (int)LogLevel.All, -1, DateTime.Now.AddDays(-100), DateTime.Now.AddDays(1));
+            ClientContext.ExecuteQueryRetry();
+            if (spContent.Value.Rows.Count > 0) return (int)spContent.Value.Rows.First()["ContentSourceID"];
+            return -1;
+        }
+
+        private int GetContentSourceIdForUserProfiles(DocumentCrawlLog crawlLog)
+        {
+            var hostName = GetHostName();
+            var peopleContent = crawlLog.GetCrawledUrls(false, 100, $"sps3s://{hostName}-my.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}", true, -1, (int)LogLevel.All, -1, DateTime.Now.AddDays(-100), DateTime.Now.AddDays(1));
+            ClientContext.ExecuteQueryRetry();
+            if (peopleContent.Value.Rows.Count > 0) return (int)peopleContent.Value.Rows.First()["ContentSourceID"];
+            return -1;
+        }
+
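+        /// <summary>
+        /// Maps a raw crawl log row to a strongly typed entry. SPItemModifiedTime and
+        /// LastModifiedTime hold FILETIME ticks, while ErrorDesc and ErrorCode describe
+        /// why the item failed to be indexed.
+        /// </summary>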
+        private static UnsuccesfullCrawlEntry MapCrawlLogEntry(Dictionary<string, object> dictionary)
+        {
+            var entry = new UnsuccesfullCrawlEntry
+            {
+                ItemId = (int)dictionary["DocID"],
+                Url = dictionary["FullUrl"].ToString(),
+                CrawlTime = (DateTime)dictionary["TimeStamp"],
+                LastTouchedTime = (DateTime)dictionary["LastTouchedTime"],
+                DatabaseName = (string)dictionary["DatabaseName"]
+            };
+            // Prefer SPItemModifiedTime and fall back to LastModifiedTime.
+            var time = (dictionary["SPItemModifiedTime"] ?? dictionary["LastModifiedTime"] ?? "").ToString();
+            long.TryParse(time, out long ticks);
+            if (ticks != 0)
+            {
+                var itemDate = DateTime.FromFileTimeUtc(ticks);
+                entry.ItemTime = itemDate;
+            }
+
+            entry.Status = (dictionary["ErrorDesc"] ?? "").ToString();
+            entry.ErrorCode = int.Parse(dictionary["ErrorCode"] + "");
+            return entry;
+        }
+
+        private object ConvertToPSObject(IDictionary<string, object> r)
+        {
+            PSObject res = new PSObject();
+            if (r != null)
+            {
+                foreach (var kvp in r)
+                {
+                    res.Properties.Add(new PSNoteProperty(kvp.Key, kvp.Value));
+                }
+            }
+            return res;
+        }
+#endregion
+    }
+}
diff --git a/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs b/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs
new file mode 100644
index 000000000..40835ae88
--- /dev/null
+++ b/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs
@@ -0,0 +1,89 @@
+using System;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using System.Management.Automation.Runspaces;
+
+namespace PnP.PowerShell.Tests.Search
+{
+    [TestClass]
+    public class GetUnsuccesfulCrawledUrlsTests
+    {
+        #region Test Setup/CleanUp
+        [ClassInitialize]
+        public static void Initialize(TestContext testContext)
+        {
+            // This runs on class level once before all tests run
+            //using (var ctx = TestCommon.CreateClientContext())
+            //{
+            //}
+        }
+
+        [ClassCleanup]
+        public static void Cleanup(TestContext testContext)
+        {
+            // This runs on class level once
+            //using (var ctx = TestCommon.CreateClientContext())
+            //{
+            //}
+        }
+
+        [TestInitialize]
+        public void Initialize()
+        {
+            using (var scope = new PSTestScope())
+            {
+                // Example
+                // scope.ExecuteCommand("cmdlet", new CommandParameter("param1", prop));
+            }
+        }
+
+        [TestCleanup]
+        public void Cleanup()
+        {
+            using (var scope = new PSTestScope())
+            {
+                try
+                {
+                    // Do Test Setup - Note, this runs PER test
+                }
+                catch (Exception)
+                {
+                    // Describe Exception
+                }
+            }
+        }
+        #endregion
+
+        #region Scaffolded Cmdlet Tests
+        //TODO: This is a scaffold of the cmdlet - complete the unit test
+        //[TestMethod]
+        public void GetPnPUnsuccesfulCrawledUrlsTest()
+        {
+            using (var scope = new PSTestScope(true))
+            {
+                // Complete writing cmd parameters
+
+                // From Cmdlet Help: Filter to limit what is being returned. Has to be a URL prefix for SharePoint content, and part of a user principal name for user profiles. Wildcard characters are not supported.
+                var filter = "";
+                // From Cmdlet Help: Start date to start getting entries from. Defaults to the beginning of time.
+                var startDate = "";
+                // From Cmdlet Help: End date to stop getting entries at. Defaults to the current time.
+                var endDate = "";
+                // From Cmdlet Help: Show raw crawl log data
+                var rawFormat = "";
+                // From Cmdlet Help: Increases the request timeout for this command to accommodate large result sets
+                var increaseRequestTimeout = "";
+
+                var results = scope.ExecuteCommand("Get-PnPUnsuccesfulCrawledUrls",
+                    new CommandParameter("Filter", filter),
+                    new CommandParameter("StartDate", startDate),
+                    new CommandParameter("EndDate", endDate),
+                    new CommandParameter("RawFormat", rawFormat),
+                    new CommandParameter("IncreaseRequestTimeout", increaseRequestTimeout));
+
+                Assert.IsNotNull(results);
+            }
+        }
+        #endregion
+    }
+}
\ No newline at end of file