comparison pyspark/n3Parser.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e34cf1b6fe09
1 # Part of DML (Digital Music Laboratory)
2 #
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public
14 # License along with this library; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17 from rdflib import Graph
18 from rdflib.plugins.parsers.notation3 import BadSyntax
19 import warnings
20 import codecs
21 import platform
22
23 # Load and parse an n3 file
def get_rdf_graph_from_n3(n3_file_uri):
    """Load and parse an n3 file into an rdflib Graph.

    The URI is first handed to rdflib directly. If that fails, the file is
    re-read from the local path (via uri2path) and its content is passed to
    parse_potentially_corrupt_n3:

    * UnicodeDecodeError: the file is re-read decoded as iso-8859-1.
    * AssertionError or BadSyntax: the file is re-read in plain text mode.

    Errors raised by the fallback read or parse are not caught here.

    :param n3_file_uri: URI of the n3 file; expected to be a 'file://' URI
        so that it can be converted to a local path for the fallback.
    :return: the parsed rdflib Graph.
    """
    graph = Graph()

    try:
        graph.parse(n3_file_uri, format="n3")
    except UnicodeDecodeError:
        n3_file_str = uri2path(n3_file_uri)
        # 'with' guarantees the handle is closed even if reading fails
        with codecs.open(n3_file_str, 'r', "iso-8859-1") as n3_file_iso:
            content = n3_file_iso.read()

        # check if n3 is valid and parse
        # repair if necessary
        graph = parse_potentially_corrupt_n3(content)
    except (AssertionError, BadSyntax):
        n3_file_str = uri2path(n3_file_uri)
        with open(n3_file_str, 'r') as n3_file:
            content = n3_file.read()
        graph = parse_potentially_corrupt_n3(content)

    return graph
46
47 # can parse truncated n3
def parse_potentially_corrupt_n3(content):
    """Parse n3 text into an rdflib Graph, tolerating a truncated last entry.

    The text is considered complete when a '.' occurs within its last four
    characters. Otherwise everything from the last ':event' marker onwards
    is dropped before parsing and a warning is issued. If no ':event'
    marker exists there is nothing to cut, so the text is parsed unchanged
    and any syntax error is reported by the parser itself.

    :param content: n3 document as a string.
    :return: rdflib Graph holding the parsed statements.
    """
    feature_graph = Graph()

    # test if file is complete.
    if '.' not in content[-4:]:
        # we find the last correct event
        lastentry = content.rfind(':event')
        # rfind returns -1 when the marker is absent; slicing with -1 would
        # silently chop one arbitrary trailing character instead of an entry
        if lastentry != -1:
            warnings.warn("Incomplete rdf file, ignoring last entry")
            content = content[:lastentry]

    feature_graph.parse(data=content, format="n3")
    return feature_graph
61
62 # returns filepath from url
def uri2path(n3_file_uri):
    """Return the file path contained in a 'file://' URI.

    The argument is converted with str(). When it starts with 'file://',
    the leading 7 characters are removed (8 when platform.system()
    contains 'Win'). Anything that does not start with 'file://' is
    returned unchanged instead of being cut blindly.

    :param n3_file_uri: URI (string or object convertible with str()).
    :return: the path as a string.
    """
    n3_file_uri_str = str(n3_file_uri)

    # Only strip the scheme when it is actually there
    if not n3_file_uri_str.startswith('file://'):
        return n3_file_uri_str

    # One extra character is skipped on Windows - presumably the slash in
    # front of the drive letter ('file:///C:/...'); confirm against callers
    if 'Win' in platform.system():
        file_uri_start_index = 8
    else:
        file_uri_start_index = 7

    return n3_file_uri_str[file_uri_start_index:]