annotate python/copyRemoteSaves.py @ 3140:7180d6a2a271

Add super basic web scraper to collect remote tests
author Nicholas Jillings <nicholas.jillings@mail.bcu.ac.uk>
date Wed, 24 Mar 2021 17:07:59 +0000
parents
children
rev   line source
nicholas@3140 1 #!/usr/bin/python
nicholas@3140 2
nicholas@3140 3 import xml.etree.ElementTree as ET
nicholas@3140 4 import os
nicholas@3140 5 import sys
nicholas@3140 6 from lxml import html
nicholas@3140 7 import requests
nicholas@3140 8
nicholas@3140 9
def normalise_saves_url(url):
    """Return *url* rewritten to address the remote ``saves/`` directory.

    The result always ends in ``/saves/``. The trailing slash matters
    because file names are appended directly to this URL when downloading;
    without it the request would go to ``.../savesNAME.xml``.

    Args:
        url: Base URL of the remote installation as typed by the user, with
            or without a trailing ``/saves`` or ``/``.

    Returns:
        The URL of the saves directory, terminated by ``/``.
    """
    url = url.strip().rstrip('/')
    if not url.endswith('/saves'):
        url += '/saves'
    return url + '/'


def local_save_path(output_dir, link_name):
    """Return the local path under *output_dir* for a remote link name.

    Only the final path component of *link_name* is kept. The name comes
    from a remote page, so directory parts such as ``../`` are dropped to
    keep every download inside *output_dir*.

    Args:
        output_dir: Destination directory; an empty string means the
            current working directory.
        link_name: Link text scraped from the remote directory listing.

    Returns:
        The path the downloaded file should be written to.
    """
    return os.path.join(output_dir, os.path.basename(link_name))


def main():
    """Prompt for a remote URL and a local directory, then copy the saves.

    Fetches the listing page of the remote saves directory, and downloads
    every link whose text ends in ``.xml`` into the chosen directory.
    Exits with a message if the listing itself cannot be fetched; a file
    that fails to download is reported and skipped.
    """
    url = normalise_saves_url(input('Where is the remote WAET URL? '))
    output = input('Where am I saving all these? (Provide the full path using pwd to the saves directory) ').strip()
    print(url)

    page = requests.get(url)
    if not page.ok:
        # Parsing an error page would silently yield "no saves found".
        raise SystemExit('Could not read {} (HTTP {})'.format(url, page.status_code))
    tree = html.fromstring(page.content)
    print(tree)

    # The visible text of each anchor is used as the remote file name.
    ahref = tree.xpath('//a/text()')
    for a in ahref:
        if not a.endswith('.xml'):
            continue
        r = requests.get(url + a, allow_redirects=True)
        if not r.ok:
            # Do not store an HTTP error body as if it were a save file.
            print('Skipping {} (HTTP {})'.format(a, r.status_code))
            continue
        with open(local_save_path(output, a), 'wb') as save_file:
            save_file.write(r.content)
    # NOTE(review): original indentation was lost in this view; the list of
    # links is printed once after the loop - confirm this matches intent.
    print(ahref)


if __name__ == '__main__':
    main()