Mercurial > hg > webaudioevaluationtool
annotate python/copyRemoteSaves.py @ 3140:7180d6a2a271
Add super basic web scraper to collect remote tests
author | Nicholas Jillings <nicholas.jillings@mail.bcu.ac.uk> |
---|---|
date | Wed, 24 Mar 2021 17:07:59 +0000 |
parents | |
children |
rev | line source |
---|---|
nicholas@3140 | 1 #!/usr/bin/python |
nicholas@3140 | 2 |
nicholas@3140 | 3 import xml.etree.ElementTree as ET |
nicholas@3140 | 4 import os |
nicholas@3140 | 5 import sys |
nicholas@3140 | 6 from lxml import html |
nicholas@3140 | 7 import requests |
nicholas@3140 | 8 |
nicholas@3140 | 9 |
# Seconds to wait on each HTTP request; requests has no default timeout and
# would otherwise block forever on an unresponsive server.
REQUEST_TIMEOUT = 30


def normalise_saves_url(url):
    """Return *url* pointing at the remote ``saves/`` directory.

    The result always ends in ``/saves/``. The trailing slash matters because
    file names from the listing are appended directly to this URL; without it
    ``.../saves`` + ``x.xml`` would become ``.../savesx.xml``.
    """
    if not url.endswith('/'):
        url += '/'
    if not url.endswith('/saves/'):
        url += 'saves/'
    return url


def local_save_path(output, link_text):
    """Return the path inside *output* under which *link_text* is stored.

    Only the final path component of the link text is used, so a listing
    entry such as ``../../x.xml`` cannot write outside *output*.
    """
    return os.path.join(output, os.path.basename(link_text))


def main():
    """Prompt for a remote URL and a local directory, then copy the saves.

    Fetches the page at the remote ``saves/`` URL, takes the text of every
    ``<a>`` element that ends in ``.xml`` as a file name, downloads each one
    and writes it into the chosen directory.

    Raises:
        requests.HTTPError: if the listing page itself cannot be fetched.
    """
    url = input('Where is the remote WAET URL? ')
    output = input('Where am I saving all these? (Provide the full path using pwd to the saves directory) ')
    url = normalise_saves_url(url)
    print(url)

    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than parsing an error page and copying nothing.
    page.raise_for_status()
    tree = html.fromstring(page.content)
    print(tree)

    # NOTE(review): this relies on the link *text* being the full file name;
    # confirm the remote listing does not abbreviate long names.
    ahref = tree.xpath('//a/text()')
    for a in ahref:
        if not a.endswith('.xml'):
            continue
        r = requests.get(url + a, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        if not r.ok:
            # Do not store an HTTP error body as if it were a save file.
            print('Skipping {}: HTTP {}'.format(a, r.status_code), file=sys.stderr)
            continue
        with open(local_save_path(output, a), 'wb') as save_file:
            save_file.write(r.content)
    print(ahref)


if __name__ == '__main__':
    main()