import sha import urllib import xml.dom.minidom as minidom import RDF import config import StringIO import gzip class Grab(urllib.URLopener): def __init__(self, *args): self.version = 'SemWebAgent (julie/redlandbot 0.5)' urllib.URLopener.__init__(self, *args) urllib._urlopener = Grab() urllib._urlopener.addheader("Accept","application/rdf+xml,*/*") debug = 0 common_namespaces = { 'owl': 'http://www.w3.org/2002/07/owl#', 'wn16': 'http://xmlns.com/wordnet/1.6/', 'prf': 'http://www.openmobilealliance.org/tech/profiles/UAPROF/ccppschema-20021212#', 'doaml': 'http://ns.balbinus.net/doaml#', 'periodic': 'http://www.daml.org/2003/01/periodictable/PeriodicTable#', 'tipjar': 'http://crschmidt.net/foaf/tipjar/', 'factbook': 'http://www.daml.org/2001/12/factbook/factbook-ont#', 'srw10': 'http://purl.org/net/inkel/rdf/schemas/lang/1.0#', 'srw11': 'http://purl.org/net/inkel/rdf/schemas/lang/1.1#', 'rank': 'http://www.daml.org/2002/03/ranks/rank-ont#', 'list': 'http://crschmidt.net/ns/list#', 'weather': 'http://purl.org/net/vocab/2004/10/weather#', 'irc': 'http://crschmidt.net/ns/irc#', 'menow': 'http://schema.menow.org/#', 'kissed': 'http://www.gnowsis.org/ont/kissology', 'todo': 'http://crschmidt.net/ns/todo#', 'astrology': 'http://www.ideaspace.net/users/wkearney/schema/astrology/0.1#', 'trust': 'http://trust.mindswap.org/ont/trust.owl#', 'tz': 'http://www.isi.edu/~pan/damltime/timezone-ont.owl#', 'exif': 'http://www.w3.org/2003/12/exif/ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'tv': 'http://xmlns.com/2003/rdftv/tv#', 'wot': 'http://xmlns.com/wot/0.1/', 'review': 'http://www.purl.org/stuff/rev#', 'mtbi': 'http://www.ideaspace.net/users/wkearney/schema/mbti/0.1#', 'rev': 'http://www.purl.org/stuff/rev#', 'ilike': 'http://rdf.netalleynetworks.com/ilike/20040830#', 'usranks': 'http://www.daml.org/2002/03/ranks/us#', 'content': 'is http://purl.org/rss/1.0/modules/content/', 'cc': 'http://web.resource.org/cc/', 'book': 'http://www.hackcraft.net/bookrdf/vocab/0_1/', 'music': 'http://www.kanzaki.com/ns/music#', 'rel': 'http://purl.org/vocab/relationship/', 'nyse': 'http://www.daml.org/2001/10/html/nyse-ont#', 'ref': 'http://mozref.com/2004/07/14-object-reference#', 'dcterms': 'http://purl.org/dc/terms/', 'quaff': 'http://purl.org/net/schemas/quaffing/', 'schema': 'http://www.schemaweb.info/schemas/meta/rdf/', 'foaf': 'http://xmlns.com/foaf/0.1/', 'bio': 'http://purl.org/vocab/bio/0.1/', 'ical': 'http://www.w3.org/2002/12/cal/ical#', 'foafcorp': 'http://xmlns.com/foaf/corp#', 'bloggercode': 'http://purl.org/net/bloggercode/', 'xfn': 'http://gmpg.org/xfn/1#', 'quote': 'http://homepage.mac.com/kpreid/2003/10/rdf-quote.xml#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'doap': 'http://usefulinc.com/ns/doap#', 'dc': 'http://purl.org/dc/elements/1.1/', 'cyc': 'http://opencyc.sourceforge.net/daml/cyc.daml#', 'fc': 'http://xmlns.com/foaf/corp#', 'fips': 'http://www.daml.org/2001/09/countries/fips#', 'kiss': 'http://www.gnowsis.org/ont/kissology#', 'rss': 'http://purl.org/rss/1.0/', 'dct': 'is http://purl.org/dc/terms/', 'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#', 'vcard': 'http://www.w3.org/2001/vcard-rdf/3.0#', 'vann': 'http://purl.org/vocab/vann/', 'dc10': 'http://purl.org/dc/1.0/', 'chiefs': 'http://www.daml.org/2002/02/chiefs/chiefs-ont#', 'foo': 'http://www.w3.org/2005/01/wf/assign#', 'rss11': 'http://purl.org/net/rss1.1#', 'img': 'http://jibbering.com/2002/3/svg/#', 'chefmoz': 'http://chefmoz.org/rdf/elements/1.0/', 'area': 'http://www.daml.org/2002/02/telephone/1/areacodes-ont#', 'flow': 'http://www.w3.org/2005/01/wf/flow#', 'p': 'http://purl.org/net/rss1.1/payload#', 'contact': 'http://www.w3.org/2000/10/swap/pim/contact#', 'iso': 'http://www.daml.org/2001/09/countries/iso#', 'bot': 'http://crschmidt.net/ns/bot#', 'xsd': 'http://www.w3.org/2001/XMLSchema#', 'witw-is': 'http://nwalsh.com/xmlns/witw-is#', 'cyc2004': 'http://www.cyc.com/2004/06/04/cyc#', 'pm': 'http://www.pm.org/rdf/0.1/', 'iso3166': 'http://www.daml.org/2001/09/countries/iso-3166-ont#', 'mozrdf': 'http://www.mozilla.org/2004/em-rdf#', 'fact': 'http://www.daml.org/2001/12/factbook/factbook-ont#' } def get_model(options=None): if (options): options = ",%s" % options return RDF.Model(RDF.Storage(storage_name="mysql", name="abc", options_string="merge='yes',host='127.0.0.1',user='%s',password='%s',database='redland'%s" % (config.username, config.password, options))) def count_statements(m, uri): j = 0 for i in m.find_statements(RDF.Statement(None, RDF.Uri(uri), None)): j += 1 return j def rdqlquery(m, query): query = "%s USING rdf for " % query for i in common_namespaces.keys(): query = "%s, %s for <%s>" % (query, i, common_namespaces[i]) q = RDF.RDQLQuery(query) return q.execute(m) def parse_anything(m, string): url = "http://example.org/unknowndatasource" if (string[0:5] == "http:"): url = string u = urllib.urlopen(string) ct = u.info().get("Content-Type") content = u.read() u.close() if (ct.startswith("application/x-gzip")): s = StringIO.StringIO(content) g = gzip.GzipFile(fileobj=s) url = string string = g.read() if debug: print "Gzip" elif (ct.startswith("text/html")): g = RDF.Parser(name="grddl") if debug: print 'grddl' g.parse_into_model(m,string) return 'GRDDL' elif (ct.startswith("application/x-turtle") or string[-4:] == ".ttl" or string[-6:] == "turtle"): t = RDF.TurtleParser() if debug: print "turtle" t.parse_string_into_model(m, content,string) return "turtle" elif (ct.startswith("application/rdf+xml")): r = RDF.RDFXMLParser() if debug: print "rdf" r.parse_string_into_model(m, content, string) return "RDF/XML" elif (ct.startswith("image/svg+xml") or string[-4:] == ".svg"): p = RDF.Parser() p.set_feature("http://feature.librdf.org/raptor-scanForRDF", "1") p.parse_string_into_model(m, content,string) return "SVG" elif (string[-3:] == ".nt"): nt = RDF.NTriplesParser() if debug: print "ntriples" nt.parse_string_into_model(m, content, string) return "ntriples" elif (ct.startswith("application/atom+xml") or ct.startswith("application/rss+xml")): rss = RDF.RSSTagSoupParser() if debug: print "rss" rss.parse_string_into_model(m, content, string) return "RSS" elif(ct.startswith("text/xml") and string[-3:] == "rss"): rss = RDF.RSSTagSoupParser() if debug: print "rss" rss.parse_string_into_model(m, content, string) return "RSS" elif(ct.startswith("text/xml")): try: rss = RDF.RSSTagSoupParser() if debug: print "rss" rss.parse_string_into_model(m, content, string) return "RSS" except RDF.RedlandError: r = RDF.RDFXMLParser() if debug: print "rdf" r.parse_string_into_model(m, content, string) return "RDF/XML" else: string = content if ("@prefix " in string): t = RDF.TurtleParser() if debug: print "turtle" t.parse_string_into_model(m, string, url) return "turtle from string" elif ("xmlns=" in string or "xmlns:rdf=" in string): r = RDF.RDFXMLParser() if debug: print "rdf" r.parse_string_into_model(m, string, url) return "RDF/XML from string" elif (string[0] == "