From 038152393727bfafc26f25e3e5c14e6f1219e07a Mon Sep 17 00:00:00 2001 From: Trygve Laugstøl Date: Wed, 16 Oct 2019 06:12:09 +0200 Subject: part: Adding category concept. Useful for later classifying components. --- src/ee/digikey/__init__.py | 57 ++++++++++++++++++++---------------------- src/ee/digikey/search_parts.py | 23 +++++++++++++---- 2 files changed, 45 insertions(+), 35 deletions(-) (limited to 'src/ee/digikey') diff --git a/src/ee/digikey/__init__.py b/src/ee/digikey/__init__.py index a318024..b993c31 100644 --- a/src/ee/digikey/__init__.py +++ b/src/ee/digikey/__init__.py @@ -16,6 +16,7 @@ from selenium import webdriver import ee._utils from ee import EeException from ee.money import Money, get_default_context +from ee.part import Category from ee.tools import mk_parents money = get_default_context() @@ -135,12 +136,13 @@ class Document(object): @total_ordering class DigikeyProduct(object): - def __init__(self, part_number, mpn, url, attributes: List["DigikeyAttributeValue"] = None, categories=None): + def __init__(self, part_number, mpn, url, attributes: List["DigikeyAttributeValue"] = None, + categories: List[Category] = None): self.part_number = _clean(part_number) self.mpn = _clean(mpn) self.url = url self.attributes = attributes or [] # type: List["DigikeyAttributeValue"] - self.categories = categories or [] + self.categories = categories or [] # type: List[Category] self.quantity_available = None self.description = None self.price_breaks: List[PriceBreak] = [] @@ -158,6 +160,10 @@ class DigikeyProduct(object): def __hash__(self): return self.part_number.__hash__() + @property + def uri(self): + return "https://digikey.com/pn#{}".format(self.part_number) + def attribute_by_id(self, _id): return next((a for a in self.attributes if a.attribute_type.id == _id), None) @@ -228,33 +234,6 @@ class DigikeyAttributeValue(object): assert self.attribute_type -@total_ordering -class DigikeyProductCategory(object): - def __init__(self, _id, label, 
digikey_url=None, parent=None): - self.id = _clean(_id) - self.label = _clean(label) - self.digikey_url = digikey_url if digikey_url is None or digikey_url.startswith("http") else \ - "https://www.digikey.com" + digikey_url - self.parent: DigikeyProductCategory = parent - self.subCategories: List[DigikeyProductCategory] = [] - - assert self.id - assert self.label - - def __eq__(self, other: "DigikeyProductCategory"): - return self.id == other.id - - def __lt__(self, other: "DigikeyProductCategory") -> bool: - return self.label < other.label - - def add_sub_category(self, _id, label, digikey_url): - sc = DigikeyProductCategory(_id, label, digikey_url=digikey_url, parent=self) - self.subCategories.append(sc) - - def find_sub_category_by_label(self, label): - return next((sc for sc in self.subCategories if sc.label == label), None) - - class SearchResponseTypes(enum.Enum): MANY = 1 # A product table was returned. SINGLE = 2 # A product page was returned @@ -413,6 +392,24 @@ class DigikeyParser(object): a_type = self.digikey.get_attribute_type(attribute_type_id, label) attributes.append(DigikeyAttributeValue(value, a_type)) + # links = tree.xpath("//table[@id='product-attribute-table']//*[class='attributes-td-categories-link']") + links = tree.xpath("//*[@class='attributes-td-categories-link']/a") + parent = None + for a in links: + name = _clean(a.text) + href = _clean(a.get("href")) + if not href: + continue + m = re.fullmatch(".*/([0-9]+)", href) + href = self.ensure_absolute_url(origin_url, href) + + if m and name and href: + category_id = m.group(1) + uri = "https://digikey.com/category#{}".format(category_id) + c = Category(uri, name, href, parent) + parent = c + categories.append(c) + if part_number and mpn: p = DigikeyProduct(part_number, mpn, url, attributes, categories) p.price_breaks = self._parse_price_breaks(tree) @@ -565,7 +562,7 @@ class DigikeyParser(object): return DigikeySearchResponse(1, SearchResponseTypes.NO_MATCHES) @staticmethod - def 
ensure_absolute_url(origin_url, url): + def ensure_absolute_url(origin_url, url: Optional[str]): if url is None: return diff --git a/src/ee/digikey/search_parts.py b/src/ee/digikey/search_parts.py index eace4e3..1f1184a 100644 --- a/src/ee/digikey/search_parts.py +++ b/src/ee/digikey/search_parts.py @@ -1,11 +1,11 @@ import functools from pathlib import Path -from typing import List, MutableSet, Mapping +from typing import List, MutableSet, Mapping, Set from ee import EeException from ee.db import ObjDb from ee.digikey import Digikey, DigikeyParser, DigikeyClient, SearchResponseTypes, DigikeyProduct, DigikeyStore -from ee.part import PartDb, load_db, save_db, Part +from ee.part import PartDb, load_db, save_db, Part, Category from ee.tools import mk_parents from ee.xml import types, uris from ee.xml.uris import make_digikey_fact_key @@ -71,6 +71,7 @@ class QueryEngine(object): out_parts.add_index("spn", lambda p: [pn.value for pn in p.get_spns()], multiple=True) self.out_parts = out_parts + self.categories: Set[Category] = set() def pn_search(self, pn): return self.pn_spn_search(pn, False) @@ -78,6 +79,9 @@ class QueryEngine(object): def spn_search(self, pn): return self.pn_spn_search(pn, True) + def _collect_categories(self, product: DigikeyProduct): + [self.categories.add(c) for c in product.categories] + def pn_spn_search(self, pn, is_spn): s = "Searching for '{}'".format(pn) print(s, file=self.log) @@ -92,7 +96,9 @@ class QueryEngine(object): if response.response_type == SearchResponseTypes.EXCEPTION: result = "exception" elif response.response_type == SearchResponseTypes.SINGLE: - out_part = resolved(self.store.url, response.products[0]) + product = response.products[0] + out_part = resolved(self.store.url, product) + self._collect_categories(product) out_pn = out_part.get_exactly_one_spn() if is_spn else out_part.get_exactly_one_mpn() out_pn = out_pn.valueProp @@ -129,7 +135,9 @@ class QueryEngine(object): page = self.client.get_for_product_url(part.url, 
part.part_number) response = self.parser.parse_string(self.client.baseurl, page) if response.response_type == SearchResponseTypes.SINGLE: - out_part = resolved(self.store.url, response.products[0]) + product = response.products[0] + out_part = resolved(self.store.url, product) + self._collect_categories(product) result = "found" else: print("Unable to narrow down the part, got {} new products. Giving up.".format( @@ -153,9 +161,10 @@ class QueryEngine(object): def resolved(supplier, p: DigikeyProduct) -> Part: # TODO: fix uri - xml = types.Part(uri="https://digikey.com/pn#{}".format(p.part_number), + xml = types.Part(uri=p.uri, supplier=supplier, description=p.description, + category=types.CategoryList(), links=types.LinkList(), facts=types.FactList(), references=types.ReferenceList()) @@ -164,6 +173,9 @@ def resolved(supplier, p: DigikeyProduct) -> Part: if p.url: part.get_links().append(types.Link(url=p.url, relation="canonical", media_type="text/html")) + if len(p.categories): + xml.set_category(p.categories[-1].uri) + for d in p.documents: title = "{}: {}".format(d.section, d.title) relations = ["http://purl.org/ee/link-relation#documentation"] @@ -277,4 +289,5 @@ def run_search_parts(in_path: Path, out_path: Path, log, cache_dir: Path, store_ part_db = PartDb() for part in engine.out_parts: part_db.add_entry(part, True) + part_db.categories = engine.categories save_db(out_path, part_db, sort=True) -- cgit v1.2.3