#!/usr/bin/python
# HG changeset patch
# User Nicholas Jillings
# Date 1616605679 0
# Node ID 7180d6a2a271ecd3a3992309a43c162ec653e8ae
# Parent bc0ef78bb07acd722291f79e84a582f351c97051
# Add super basic web scraper to collect remote tests
#
# diff -r bc0ef78bb07a -r 7180d6a2a271 python/copyRemoteSaves.py
"""Copy the ``.xml`` files listed on a remote WAET ``saves`` page to a local directory.

The script prompts for the remote WAET URL and for a local directory,
fetches the HTML page at ``<url>/saves/``, and downloads every link whose
text ends in ``.xml`` into that directory.

NOTE(review): this presumably relies on the web server rendering a directory
listing for ``saves/`` in which each link's text is the file name -- confirm
against the target server.
"""

import xml.etree.ElementTree as ET
import os
import sys
from lxml import html
import requests

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 30


def saves_index_url(url):
    """Return *url* extended to point at the ``saves/`` directory.

    The result always ends in ``/saves/``. The trailing slash matters because
    file names are appended directly to this URL; without it the request
    would go to ``.../savesNAME.xml`` instead of ``.../saves/NAME.xml``.
    """
    if url.endswith('/saves/'):
        return url
    if url.endswith('/saves'):
        return url + '/'
    if not url.endswith('/'):
        url = url + '/'
    return url + 'saves/'


def download_saves(url, output):
    """Download every ``.xml`` file linked from the page at *url* into *output*.

    Args:
        url: URL of the remote ``saves/`` page; must end with ``/``.
        output: Existing local directory that receives the files.

    Raises:
        requests.HTTPError: If the index page itself cannot be fetched.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly here rather than scraping links out of an error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)
    print(tree)
    ahref = tree.xpath('//a/text()')
    for a in ahref:
        if not a.endswith('.xml'):
            continue
        r = requests.get(url + a, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        if not r.ok:
            # Do not store an HTTP error body under an .xml name.
            print('Skipping ' + a + ': HTTP ' + str(r.status_code))
            continue
        # The link text comes from the remote page; keep only its final path
        # component so it cannot direct the write outside *output*.
        target = os.path.join(output, os.path.basename(a))
        with open(target, 'wb') as f:
            f.write(r.content)
    print(ahref)


def main():
    """Prompt for the remote URL and local directory, then copy the saves."""
    url = input('Where is the remote WAET URL? ')
    output = input('Where am I saving all these? (Provide the full path using pwd to the saves directory) ')
    url = saves_index_url(url)
    print(url)
    download_saves(url, output)


if __name__ == '__main__':
    main()