Anonymous Anonymous - 1 month ago
314 0

No description

PowerShell

Get all links from list of webpages (specific)

# File locations. $InputLinksFile is only read when the manual page list (commented out below) is enabled;
# $OutputLinksFile is where the collected links are written.
$InputLinksFile = "c:\temp\InputLinks.txt"
$OutputLinksFile = "C:\temp\OutputLinks.txt"

# Listing URL without its page number; the page number is appended directly after "p=".
$BasePage = "https://www.fanfiction.net/tv/Buffy-The-Vampire-Slayer/?&srt=2&lan=1&r=10&p="
# Inclusive range of listing pages to visit.
[int]$FirstPageNumber = 600
[int]$LastPageNumber = 601

# Make a list of all the pages we want to input, counting from FirstPageNumber to LastPageNumber.
# The loop output is collected in one go instead of growing the array with += on every pass;
# @( ) keeps the result an array even when the range holds zero or one page.
$InputLinks = @(
	for ($CurrentPageNumber = $FirstPageNumber; $CurrentPageNumber -le $LastPageNumber; $CurrentPageNumber++) {
		"$BasePage$CurrentPageNumber"
	}
)

# To read the list of pages from $InputLinksFile instead of generating it above, uncomment the next line:
#$InputLinks = Get-Content -Path $InputLinksFile

# Visit every page in $InputLinks, pick out the links whose href contains "/1/",
# and append them as absolute URLs to $OutputLinksFile (one URL per line).
ForEach ($InputLink in $InputLinks) {
	# Reset up front so a failed request can never leave the previous page's links in place.
	$InputPageLinks = $null
	try {
		# Fetch the entire page and take its links with ().Links. Per the original author's note the page is
		# served gzip-compressed, hence the explicit Accept-Encoding header.
		# -ErrorAction Stop turns any request error into an exception the catch block can handle.
		$InputPageLinks = (Invoke-WebRequest -Uri $InputLink -Headers @{"Accept-Encoding"="gzip"} -ErrorAction Stop).Links
	}
	catch {
		# One unreachable page should not stop the run: report it and move on to the next page.
		Write-Warning "Skipping $($InputLink): $($_.Exception.Message)"
		continue
	}

	# Filter the link list to only contain links with the sequence "/1/" in it.
	$FilteredOutputLinks = $InputPageLinks | Where-Object { $_.href -like "*/1/*" }

	# The links are expected to be relative, so the domain name is prepended.
	# An href that is already an absolute http(s) URL is passed through untouched.
	$FinalLinks = @(
		foreach ($OutputLink in $FilteredOutputLinks) {
			if ($OutputLink.href -match '^https?://') {
				$OutputLink.href
			}
			else {
				"https://fanfiction.net$($OutputLink.href)"
			}
		}
	)

	# Append once per page rather than once per link; skip the call entirely when nothing matched
	# so that no output file is created for an empty result.
	if ($FinalLinks.Count -gt 0) {
		$FinalLinks | Out-File -Append -FilePath $OutputLinksFile
	}
}