"""Scrape Digikey product categories, attribute types and products into RDF-ready objects."""
import re
from typing import List

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter
from lxml import html
from rdflib import Literal
from rdflib.namespace import RDF, RDFS

# NOTE(review): the code below refers to the bare name `rdf`. This import only
# binds that name when this file is the `trygvis.eda.digikey` package module
# itself (the submodule is then set as an attribute of this module) - confirm.
import trygvis.eda.digikey.rdf

# Prefix applied to site-relative URLs.
_DIGIKEY_BASE_URL = "http://www.digikey.com"

# Captures the last "/<digits>" run in a URL (the leading ".*" is greedy).
_URL_ID_RE = re.compile(r".*/([0-9]+)")


def normalize_filename(part):
    """Return *part* with every '/' and ' ' replaced by '_'."""
    return part.replace('/', '_').replace(' ', '_')


def _clean(s):
    """Strip *s*; return None when *s* is None or blank after stripping."""
    if s is None:
        return None
    return s.strip() or None


class DigikeyDatabase(object):
    """In-memory collection of product categories and attribute types."""

    def __init__(self):
        self.productCategories = []  # type: List[DigikeyProductCategory]
        # Attribute types keyed by their id.
        self.attributeTypes = {}

    def add_product_category(self, pc):
        """Append the top-level category *pc*."""
        self.productCategories.append(pc)

    def find_category(self, label):
        """Return the first top-level category whose label equals *label*, or None."""
        return next((c for c in self.productCategories if c.label == label), None)

    def find_sub_category_by_url(self, url):
        """Return the first sub-category whose ``url()`` equals *url*, or None."""
        for category in self.productCategories:
            for sub_category in category.subCategories:
                if sub_category.url() == url:
                    return sub_category
        return None

    def merge_attribute_types(self, attributeTypes):
        """Register every attribute type whose id is not already known.

        Types whose id is already registered are left untouched.
        """
        for attribute_type in attributeTypes:
            if attribute_type.id in self.attributeTypes:
                # TODO: implement merging
                continue
            self.attributeTypes[attribute_type.id] = attribute_type

    def find_type(self, id):
        """Return the attribute type registered under *id*, or None."""
        return self.attributeTypes.get(id, None)


class DigikeyProductCategory(object):
    """A product category, optionally nested under a parent category."""

    def __init__(self, id, label, digikey_url=None, parent=None):
        """Create a category.

        *id* and *label* are stripped and must be non-blank (checked with
        ``assert``). A *digikey_url* that does not start with "http" is
        treated as site-relative and prefixed with the Digikey host.
        """
        self.id = _clean(id)
        self.label = _clean(label)
        if digikey_url is not None and not digikey_url.startswith("http"):
            digikey_url = _DIGIKEY_BASE_URL + digikey_url
        self.digikey_url = digikey_url
        self.parent = parent  # type: DigikeyProductCategory
        self.subCategories = []  # type: List[DigikeyProductCategory]

        assert self.id is not None
        assert self.label is not None

    def add_sub_category(self, id, label, digikey_url):
        """Create a child category with this category as parent and append it."""
        sc = DigikeyProductCategory(id, label, digikey_url=digikey_url, parent=self)
        self.subCategories.append(sc)

    def find_sub_category_by_label(self, label):
        """Return the first sub-category whose label equals *label*, or None."""
        return next((sc for sc in self.subCategories if sc.label == label), None)

    def url(self):
        """Return this category's node, looked up by id in ``rdf.DIGIKEY_PRODUCT_CATEGORY``."""
        return rdf.DIGIKEY_PRODUCT_CATEGORY[self.id]

    def to_nodes(self):
        """Return this category as a list of (subject, predicate, object) 3-tuples.

        The parent and url entries are only emitted when set.
        """
        node = self.url()
        nodes = [
            (node, RDF.type, rdf.DIGIKEY.productCategory),
            (node, RDFS.label, Literal(self.label)),
        ]
        if self.parent is not None:
            parent_url = rdf.DIGIKEY_PRODUCT_CATEGORY[self.parent.id]
            nodes.append((node, rdf.DIGIKEY.parent, parent_url))
        if self.digikey_url is not None:
            nodes.append((node, rdf.DIGIKEY.url, Literal(self.digikey_url)))
        return nodes


class DigikeyAttributeType(object):
    """A filterable attribute of a category together with its selectable options."""

    def __init__(self, category, id, label, options):
        """Create an attribute type; every argument must be non-None (checked with ``assert``)."""
        self.category = category
        self.id = _clean(id)
        self.label = _clean(label)
        self.options = options

        assert self.category is not None
        assert self.id is not None
        assert self.label is not None
        assert self.options is not None

    def to_nodes(self):
        """Return the type and each of its options as (subject, predicate, object) 3-tuples."""
        node = rdf.DIGIKEY_ATTRIBUTE_TYPE[self.id]
        nodes = [
            (node, RDF.type, rdf.DIGIKEY.attributeType),
            (node, RDFS.label, Literal(self.label)),
        ]
        for option in self.options:
            # Option nodes are keyed "<type id>-<option id>".
            option_node = rdf.DIGIKEY_ATTRIBUTE_VALUE[self.id + '-' + option.id]
            nodes.extend([
                (option_node, rdf.DIGIKEY.id, Literal(option.id)),
                (option_node, RDFS.label, Literal(option.label)),
                (node, rdf.DIGIKEY.value, option_node)])
        return nodes


class DigikeyAttributeValue(object):
    """One value of an attribute.

    The owning type is given either as a ``type`` object or as a loose
    ``type_id`` / ``type_label`` pair.
    """

    def __init__(self, id, label, type=None, type_id=None, type_label=None):
        """Create a value; *id* and *label* are stripped and must be non-blank."""
        self.id = _clean(id)
        self.label = _clean(label)
        self.type = type
        self.type_id = type_id
        self.type_label = type_label

        assert self.id is not None
        assert self.label is not None


class DigikeyProduct(object):
    """A single product with its attribute values and categories."""

    def __init__(self, part_id, part_number, values, categories):
        """Create a product; *part_id* and *part_number* are stripped and must be non-blank."""
        self.part_id = _clean(part_id)
        self.part_number = _clean(part_number)
        self.values = values
        self.categories = categories
        self.quantity_available = None
        self.description = None

        assert self.part_id is not None
        assert self.part_number is not None

    def to_nodes(self):
        """Return this product as a list of (subject, predicate, object) 3-tuples."""
        node = rdf.DIGIKEY_PART[self.part_id]
        nodes = [
            (node, RDF.type, rdf.DIGIKEY.part),
            (node, rdf.DIGIKEY.partNumber, Literal(self.part_number)),
        ]
        # Only label the part when a description was actually set, so no
        # label is built from None.
        if self.description is not None:
            nodes.append((node, RDFS.label, Literal(self.description)))

        for v in self.values:
            # Prefer the linked type object; fall back to the loose type id.
            type_id = v.type.id if v.type is not None else v.type_id
            nodes.append((node, rdf.DIGIKEY['attribute-value'],
                          rdf.DIGIKEY_ATTRIBUTE_VALUE[type_id + '-' + v.id]))

        for c in self.categories:
            nodes.append((node, rdf.DIGIKEY.category, c.url()))
        return nodes


class DigikeyClient(object):
    """HTTP client whose responses are cached in a file cache."""

    def __init__(self):
        # TODO: this should be put under .eda-rdf/
        cache = FileCache('digikey_cache', forever=True)
        self.sess = CacheControl(requests.Session(), cache=cache, heuristic=ExpiresAfter(days=1))

    def req(self, url, params=None):
        """GET *url* through the caching session and return the response.

        URLs without an ``http://`` or ``https://`` scheme are treated as
        site-relative and prefixed with the Digikey host.
        """
        if not url.startswith(("http://", "https://")):
            url = _DIGIKEY_BASE_URL + url
        return self.sess.get(url, params=params)


def _to_string(e):
    """Return all text inside element *e* concatenated and stripped."""
    return "".join(e.itertext()).strip()


def _id_from_url(url):
    """Return the digits following the last "/<digits>" in *url*, or None."""
    if url is None:
        return None
    m = _URL_ID_RE.search(url)
    return m.group(1) if m else None


def download_category_tree(database: DigikeyDatabase, client: DigikeyClient, baseurl="http://www.digikey.com/products/en"):
    """Fetch *baseurl* and add every category found on the page to *database*.

    Each ``<h2>`` whose class contains 'catfiltertopitem' becomes a top-level
    category. Links in the ``<ul>`` that follows it (optionally after one
    ``<span>``) become its sub-categories. Headings and links without a
    numeric id in their href are skipped.
    """
    page = client.req(baseurl)
    dom = html.fromstring(page.content)

    for h2 in dom.xpath("//h2[contains(@class, 'catfiltertopitem')]"):
        label = _to_string(h2)

        # The last child carrying a parsable id wins; children without one
        # must not reset an id that was already found.
        pc_id = None
        for child in h2:
            child_id = _id_from_url(child.get('href'))
            if child_id is not None:
                pc_id = child_id
        if pc_id is None:
            continue

        pc = DigikeyProductCategory(pc_id, label)

        # getnext() returns None when the heading has no following sibling.
        sibling = h2.getnext()
        if sibling is not None and sibling.tag == 'span':
            sibling = sibling.getnext()
        if sibling is not None and sibling.tag == 'ul':
            for a in sibling.xpath('./li/a'):
                sub_label = _to_string(a)
                url = a.get('href')
                sub_id = _id_from_url(url)
                if sub_id is None:
                    continue
                pc.add_sub_category(sub_id, sub_label, url)

        database.add_product_category(pc)


def download_attribute_types_from_category(category: DigikeyProductCategory, client: DigikeyClient) -> List[DigikeyAttributeType]:
    """Fetch the page at ``category.digikey_url`` and return its attribute types.

    Every filter ``<select>`` inside a search form is paired with the header
    text at the same column index. Selects without a matching header or
    without a ``name`` attribute are skipped.
    """
    page = client.req(category.digikey_url)
    tree = html.fromstring(page.content)

    attributes = []
    for form in tree.xpath("//form[contains(@class, 'search-form')]"):
        print('form: ' + str(form))
        headers = form.xpath(".//tr[@id='appliedFilterHeaderRow']/th/text()")
        print("headers: " + str(headers))
        for select in form.xpath(".//td/select[contains(@class, 'filter-selectors')]"):
            td = select.getparent()
            index = td.getparent().index(td)
            try:
                attribute_label = headers[index]
            except IndexError:
                # No header text for this column.
                continue
            attribute_id = select.get('name')
            if attribute_id is None:
                continue
            print("label: " + attribute_label + ", id: " + attribute_id)

            # The type and its values reference each other: the type holds
            # the list that is filled in below.
            options = []
            attribute_type = DigikeyAttributeType(category, attribute_id, attribute_label, options)
            for option in select.xpath("./option"):
                options.append(DigikeyAttributeValue(option.get('value'), _to_string(option), type=attribute_type))
            attributes.append(attribute_type)
    return attributes


def download_product(client: DigikeyClient, db, query):
    """Search Digikey for *query* and return the product scraped from the result page.

    *db* is currently unused. Raises ``AssertionError`` (from the
    ``DigikeyProduct`` constructor) when the page yields no part id or part
    number.
    """
    # http://www.digikey.com/products/en?x=0&y=0&lang=en&site=us&keywords=553-2320-1-ND
    page = client.req("http://www.digikey.com/products/en", params={'lang': 'en', 'site': 'us', 'keywords': query})
    tree = html.fromstring(page.content)

    values = []
    categories = []
    for table in tree.xpath("//table[contains(@class, 'attributes-table-main')]"):
        for tr in table.xpath(".//tr"):
            if tr.get("id") is not None:
                continue
            tds = tr.xpath("./th | ./td")
            if len(tds) != 3:
                continue
            type_label = _to_string(tds[0])
            label = _to_string(tds[1])

            # When the cell holds several named inputs the last one wins.
            type_id = value = None
            for field in tds[2].xpath("./input[@name]"):
                type_id = field.get("name")
                value = field.get("value")
            if value is None or type_id is None:
                continue
            if type_id == "t":
                # categories are handled later
                continue
            values.append(DigikeyAttributeValue(value, label, type_id=type_id, type_label=type_label))

        for td in table.xpath(".//td[@class='attributes-td-categories-link']"):
            tr = td.getparent()
            a = next(iter(td.xpath(".//a[@href]")), None)
            if a is None:
                continue
            label = "dummy"
            url = a.get("href")
            if url is None:
                continue
            if _id_from_url(url) is None:
                continue
            category_id = next((e.get("value") for e in tr.xpath(".//input[@name='t' and @value]")), None)
            if category_id is None:
                # Without an id the category constructor would assert; skip
                # the row like the other incomplete rows above.
                continue
            categories.append(DigikeyProductCategory(category_id, label, digikey_url=url))

    part_id = part_number = None
    for n in tree.xpath("//input[@name='partid' and @value]"):
        part_id = n.get("value")
    for n in tree.xpath("//*[@itemprop='productID' and @content]"):
        part_number = n.get("content").replace('sku:', '')

    p = DigikeyProduct(part_id, part_number, values, categories)
    for n in tree.xpath("//*[@itemprop='description']"):
        p.description = _to_string(n)
    return p