From 038152393727bfafc26f25e3e5c14e6f1219e07a Mon Sep 17 00:00:00 2001 From: Trygve Laugstøl Date: Wed, 16 Oct 2019 06:12:09 +0200 Subject: part: Adding category concept. Useful for later classifiying components. --- src/ee/digikey/__init__.py | 57 ++++---- src/ee/digikey/search_parts.py | 23 +++- src/ee/part/__init__.py | 61 ++++++++- src/ee/tools/part_apply_souffle_post.py | 4 +- src/ee/xml/types.py | 231 +++++++++++++++++++++++++++++++- xsd/ee.xsd | 17 +++ 6 files changed, 352 insertions(+), 41 deletions(-) diff --git a/src/ee/digikey/__init__.py b/src/ee/digikey/__init__.py index a318024..b993c31 100644 --- a/src/ee/digikey/__init__.py +++ b/src/ee/digikey/__init__.py @@ -16,6 +16,7 @@ from selenium import webdriver import ee._utils from ee import EeException from ee.money import Money, get_default_context +from ee.part import Category from ee.tools import mk_parents money = get_default_context() @@ -135,12 +136,13 @@ class Document(object): @total_ordering class DigikeyProduct(object): - def __init__(self, part_number, mpn, url, attributes: List["DigikeyAttributeValue"] = None, categories=None): + def __init__(self, part_number, mpn, url, attributes: List["DigikeyAttributeValue"] = None, + categories: List[Category] = None): self.part_number = _clean(part_number) self.mpn = _clean(mpn) self.url = url self.attributes = attributes or [] # type: List["DigikeyAttributeValue"] - self.categories = categories or [] + self.categories = categories or [] # type: List[Category] self.quantity_available = None self.description = None self.price_breaks: List[PriceBreak] = [] @@ -158,6 +160,10 @@ class DigikeyProduct(object): def __hash__(self): return self.part_number.__hash__() + @property + def uri(self): + return "https://digikey.com/pn#{}".format(self.part_number) + def attribute_by_id(self, _id): return next((a for a in self.attributes if a.attribute_type.id == _id), None) @@ -228,33 +234,6 @@ class DigikeyAttributeValue(object): assert self.attribute_type -@total_ordering -class DigikeyProductCategory(object): - def __init__(self, _id, label, digikey_url=None, parent=None): - self.id = _clean(_id) - self.label = _clean(label) - self.digikey_url = digikey_url if digikey_url is None or digikey_url.startswith("http") else \ - "https://www.digikey.com" + digikey_url - self.parent: DigikeyProductCategory = parent - self.subCategories: List[DigikeyProductCategory] = [] - - assert self.id - assert self.label - - def __eq__(self, other: "DigikeyProductCategory"): - return self.id == other.id - - def __lt__(self, other: "DigikeyProductCategory") -> bool: - return self.label < other.label - - def add_sub_category(self, _id, label, digikey_url): - sc = DigikeyProductCategory(_id, label, digikey_url=digikey_url, parent=self) - self.subCategories.append(sc) - - def find_sub_category_by_label(self, label): - return next((sc for sc in self.subCategories if sc.label == label), None) - - class SearchResponseTypes(enum.Enum): MANY = 1 # A product table was returned. SINGLE = 2 # A product page was returned @@ -413,6 +392,24 @@ class DigikeyParser(object): a_type = self.digikey.get_attribute_type(attribute_type_id, label) attributes.append(DigikeyAttributeValue(value, a_type)) + # links = tree.xpath("//table[@id='product-attribute-table']//*[class='attributes-td-categories-link']") + links = tree.xpath("//*[@class='attributes-td-categories-link']/a") + parent = None + for a in links: + name = _clean(a.text) + href = _clean(a.get("href")) + if not href: + continue + m = re.fullmatch(".*/([0-9]+)", href) + href = self.ensure_absolute_url(origin_url, href) + + if m and name and href: + category_id = m.group(1) + uri = "https://digikey.com/category#{}".format(category_id) + c = Category(uri, name, href, parent) + parent = c + categories.append(c) + if part_number and mpn: p = DigikeyProduct(part_number, mpn, url, attributes, categories) p.price_breaks = self._parse_price_breaks(tree) @@ -565,7 +562,7 @@ class DigikeyParser(object): return DigikeySearchResponse(1, SearchResponseTypes.NO_MATCHES) @staticmethod - def ensure_absolute_url(origin_url, url): + def ensure_absolute_url(origin_url, url: Optional[str]): if url is None: return diff --git a/src/ee/digikey/search_parts.py b/src/ee/digikey/search_parts.py index eace4e3..1f1184a 100644 --- a/src/ee/digikey/search_parts.py +++ b/src/ee/digikey/search_parts.py @@ -1,11 +1,11 @@ import functools from pathlib import Path -from typing import List, MutableSet, Mapping +from typing import List, MutableSet, Mapping, Set from ee import EeException from ee.db import ObjDb from ee.digikey import Digikey, DigikeyParser, DigikeyClient, SearchResponseTypes, DigikeyProduct, DigikeyStore -from ee.part import PartDb, load_db, save_db, Part +from ee.part import PartDb, load_db, save_db, Part, Category from ee.tools import mk_parents from ee.xml import types, uris from ee.xml.uris import make_digikey_fact_key @@ -71,6 +71,7 @@ class QueryEngine(object): out_parts.add_index("spn", lambda p: [pn.value for pn in p.get_spns()], multiple=True) self.out_parts = out_parts + self.categories: Set[Category] = set() def pn_search(self, pn): return self.pn_spn_search(pn, False) @@ -78,6 +79,9 @@ class QueryEngine(object): def spn_search(self, pn): return self.pn_spn_search(pn, True) + def _collect_categories(self, product: DigikeyProduct): + [self.categories.add(c) for c in product.categories] + def pn_spn_search(self, pn, is_spn): s = "Searching for '{}'".format(pn) print(s, file=self.log) @@ -92,7 +96,9 @@ class QueryEngine(object): if response.response_type == SearchResponseTypes.EXCEPTION: result = "exception" elif response.response_type == SearchResponseTypes.SINGLE: - out_part = resolved(self.store.url, response.products[0]) + product = response.products[0] + out_part = resolved(self.store.url, product) + self._collect_categories(product) out_pn = out_part.get_exactly_one_spn() if is_spn else out_part.get_exactly_one_mpn() out_pn = out_pn.valueProp @@ -129,7 +135,9 @@ class QueryEngine(object): page = self.client.get_for_product_url(part.url, part.part_number) response = self.parser.parse_string(self.client.baseurl, page) if response.response_type == SearchResponseTypes.SINGLE: - out_part = resolved(self.store.url, response.products[0]) + product = response.products[0] + out_part = resolved(self.store.url, product) + self._collect_categories(product) result = "found" else: print("Unable to narrow down the part, got {} new products. Giving up.".format( @@ -153,9 +161,10 @@ class QueryEngine(object): def resolved(supplier, p: DigikeyProduct) -> Part: # TODO: fix uri - xml = types.Part(uri="https://digikey.com/pn#{}".format(p.part_number), + xml = types.Part(uri=p.uri, supplier=supplier, description=p.description, + category=types.CategoryList(), links=types.LinkList(), facts=types.FactList(), references=types.ReferenceList()) @@ -164,6 +173,9 @@ def resolved(supplier, p: DigikeyProduct) -> Part: if p.url: part.get_links().append(types.Link(url=p.url, relation="canonical", media_type="text/html")) + if len(p.categories): + xml.set_category(p.categories[-1].uri) + for d in p.documents: title = "{}: {}".format(d.section, d.title) relations = ["http://purl.org/ee/link-relation#documentation"] @@ -277,4 +289,5 @@ def run_search_parts(in_path: Path, out_path: Path, log, cache_dir: Path, store_ part_db = PartDb() for part in engine.out_parts: part_db.add_entry(part, True) + part_db.categories = engine.categories save_db(out_path, part_db, sort=True) diff --git a/src/ee/part/__init__.py b/src/ee/part/__init__.py index 975dc68..0845b6c 100644 --- a/src/ee/part/__init__.py +++ b/src/ee/part/__init__.py @@ -1,3 +1,4 @@ +from functools import total_ordering from pathlib import Path from typing import List, Optional, Iterator, Union @@ -8,6 +9,7 @@ from ee.xml import types __all__ = [ "FactType", "EeValueFactType", + "Category", "Part", "PartDb", "load_db", @@ -179,6 +181,34 @@ class ReferenceList(object): self.description_references.append(description) +class Links: + def __init__(self, links: Optional[types.LinkList]): + self.links = links.linkProp if links else [] + + def by_rel(self, rel) -> Optional[types.Link]: + for l in self.links: + if l.relationProp == rel: + return l + + +@total_ordering +class Category: + def __init__(self, uri: str, name: str, href: Optional[str], parent: Optional["Category"]): + self.uri = uri + self.name = name + self.href = href + self.parent = parent + + def __eq__(self, other): + return self.uri == other.uri + + def __hash__(self): + return hash(self.uri) + + def __lt__(self, other): + return self.uri < other.uri + + # TODO: Replace self.xml.referencesProp with ReferenceList class Part(object): def __init__(self, xml: types.Part): @@ -412,6 +442,7 @@ class Assembly(object): class PartDb(object): def __init__(self): self.parts: List[Entry] = [] + self.categories: List[Category] = [] self.new_entries = 0 self._assembly: Optional[Assembly] = None @@ -457,6 +488,25 @@ def load_db(path: Path) -> PartDb: for p in part_db.partsProp.part: db.add_entry(p, False) + part_db.categoriesProp = part_db.categoriesProp or types.CategoryList() + categories_by_uri = {} + categories_by_parent = [] + xml: types.Category + for xml in part_db.categoriesProp.category: + links = Links(xml.linksProp) + + c = Category(xml.uri, xml.name, None, None) + categories_by_uri[c.uri] = c + + parent = links.by_rel("parent") + if parent: + categories_by_parent.append((c, parent.urlProp)) + + db.categories.append(c) + + for c, parent_uri in categories_by_parent: + c.parent = categories_by_uri[parent_uri] + return db @@ -475,8 +525,17 @@ def save_db(path: Path, db: PartDb, sort=False): p.clean_xml() part_db.parts.partProp.append(p.underlying) - if db.has_assembly: + if len(db.categories): + cs = types.CategoryList() + for c in db.categories: + links = None + if c.parent: + links = types.LinkList() + links.add_link(types.Link(url=c.parent.uri, relation="parent")) + cs.category.append(types.Category(uri=c.uri, name=c.name, links=links)) + part_db.set_categories(cs) + if db.has_assembly: def to_xml(ap: AssemblyPart): xml = types.AssemblyPart() if ap.count != 0: diff --git a/src/ee/tools/part_apply_souffle_post.py b/src/ee/tools/part_apply_souffle_post.py index 8630603..8ef7c6a 100644 --- a/src/ee/tools/part_apply_souffle_post.py +++ b/src/ee/tools/part_apply_souffle_post.py @@ -15,10 +15,12 @@ def work(in_path: Path, out_path: Path, work_dir: Path): in_parts: ObjDb[Part] = ObjDb[Part]() uri_idx = in_parts.add_unique_index("uri", lambda p: p.uri) - for entry in load_db(in_path).parts: + in_db = load_db(in_path) + for entry in in_db.parts: in_parts.add(Part(entry.part)) out_parts = PartDb() + out_parts.categories.extend(in_db.categories) for part in in_parts: out_parts.add_entry(part, False) diff --git a/src/ee/xml/types.py b/src/ee/xml/types.py index afc7e72..2a806e5 100644 --- a/src/ee/xml/types.py +++ b/src/ee/xml/types.py @@ -3,7 +3,7 @@ # # Generated by generateDS.py. -# Python 3.7.3 (default, Apr 3 2019, 05:39:12) [GCC 8.3.0] +# Python 3.7.5rc1 (default, Oct 8 2019, 16:47:45) [GCC 9.2.1 20190909] # # Command line options: # ('-f', '') @@ -822,11 +822,12 @@ def _cast(typ, value): class PartDb(GeneratedsSuper): subclass = None superclass = None - def __init__(self, parts=None, assembly=None, **kwargs_): + def __init__(self, parts=None, assembly=None, categories=None, **kwargs_): self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') self.parts = parts self.assembly = assembly + self.categories = categories def factory(*args_, **kwargs_): if CurrentSubclassModule_ is not None: subclass = getSubclassFromModule_( @@ -848,10 +849,16 @@ class PartDb(GeneratedsSuper): def set_assembly(self, assembly): self.assembly = assembly assemblyProp = property(get_assembly, set_assembly) + def get_categories(self): + return self.categories + def set_categories(self, categories): + self.categories = categories + categoriesProp = property(get_categories, set_categories) def hasContent_(self): if ( self.parts is not None or - self.assembly is not None + self.assembly is not None or + self.categories is not None ): return True else: @@ -888,6 +895,8 @@ class PartDb(GeneratedsSuper): self.parts.export(outfile, level, namespaceprefix_, namespacedef_='', name_='parts', pretty_print=pretty_print) if self.assembly is not None: self.assembly.export(outfile, level, namespaceprefix_, namespacedef_='', name_='assembly', pretty_print=pretty_print) + if self.categories is not None: + self.categories.export(outfile, level, namespaceprefix_, namespacedef_='', name_='categories', pretty_print=pretty_print) def build(self, node): already_processed = set() self.buildAttributes(node, node.attrib, already_processed) @@ -908,18 +917,24 @@ class PartDb(GeneratedsSuper): obj_.build(child_) self.assembly = obj_ obj_.original_tagname_ = 'assembly' + elif nodeName_ == 'categories': + obj_ = CategoryList.factory(parent_object_=self) + obj_.build(child_) + self.categories = obj_ + obj_.original_tagname_ = 'categories' # end class PartDb class Part(GeneratedsSuper): subclass = None superclass = None - def __init__(self, uri=None, supplier=None, description=None, links=None, references=None, facts=None, price_breaks=None, **kwargs_): + def __init__(self, uri=None, supplier=None, description=None, category=None, links=None, references=None, facts=None, price_breaks=None, **kwargs_): self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') self.uri = _cast(None, uri) self.supplier = supplier self.description = description + self.category = category self.links = links self.references = references self.facts = facts @@ -945,6 +960,11 @@ class Part(GeneratedsSuper): def set_description(self, description): self.description = description descriptionProp = property(get_description, set_description) + def get_category(self): + return self.category + def set_category(self, category): + self.category = category + categoryProp = property(get_category, set_category) def get_links(self): return self.links def set_links(self, links): @@ -974,6 +994,7 @@ class Part(GeneratedsSuper): if ( self.supplier is not None or self.description is not None or + self.category is not None or self.links is not None or self.references is not None or self.facts is not None or @@ -1018,6 +1039,9 @@ class Part(GeneratedsSuper): if self.description is not None: showIndent(outfile, level, pretty_print) outfile.write('<%sdescription>%s%s' % (namespaceprefix_ , self.gds_encode(self.gds_format_string(quote_xml(self.description), input_name='description')), namespaceprefix_ , eol_)) + if self.category is not None: + showIndent(outfile, level, pretty_print) + outfile.write('<%scategory>%s%s' % (namespaceprefix_ , self.gds_encode(self.gds_format_string(quote_xml(self.category), input_name='category')), namespaceprefix_ , eol_)) if self.links is not None: self.links.export(outfile, level, namespaceprefix_, namespacedef_='', name_='links', pretty_print=pretty_print) if self.references is not None: @@ -1049,6 +1073,11 @@ class Part(GeneratedsSuper): value_ = self.gds_parse_string(value_, node, 'description') value_ = self.gds_validate_string(value_, node, 'description') self.description = value_ + elif nodeName_ == 'category': + value_ = child_.text + value_ = self.gds_parse_string(value_, node, 'category') + value_ = self.gds_validate_string(value_, node, 'category') + self.category = value_ elif nodeName_ == 'links': obj_ = LinkList.factory(parent_object_=self) obj_.build(child_) @@ -1159,6 +1188,197 @@ class PartList(GeneratedsSuper): # end class PartList +class Category(GeneratedsSuper): + subclass = None + superclass = None + def __init__(self, uri=None, name=None, links=None, **kwargs_): + self.original_tagname_ = None + self.parent_object_ = kwargs_.get('parent_object_') + self.uri = _cast(None, uri) + self.name = name + self.links = links + def factory(*args_, **kwargs_): + if CurrentSubclassModule_ is not None: + subclass = getSubclassFromModule_( + CurrentSubclassModule_, Category) + if subclass is not None: + return subclass(*args_, **kwargs_) + if Category.subclass: + return Category.subclass(*args_, **kwargs_) + else: + return Category(*args_, **kwargs_) + factory = staticmethod(factory) + def get_name(self): + return self.name + def set_name(self, name): + self.name = name + nameProp = property(get_name, set_name) + def get_links(self): + return self.links + def set_links(self, links): + self.links = links + linksProp = property(get_links, set_links) + def get_uri(self): + return self.uri + def set_uri(self, uri): + self.uri = uri + uriProp = property(get_uri, set_uri) + def hasContent_(self): + if ( + self.name is not None or + self.links is not None + ): + return True + else: + return False + def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='Category', pretty_print=True): + imported_ns_def_ = GenerateDSNamespaceDefs_.get('Category') + if imported_ns_def_ is not None: + namespacedef_ = imported_ns_def_ + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.original_tagname_ is not None: + name_ = self.original_tagname_ + showIndent(outfile, level, pretty_print) + outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) + already_processed = set() + self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='Category') + if self.hasContent_(): + outfile.write('>%s' % (eol_, )) + self.exportChildren(outfile, level + 1, '', namespacedef_, name_='Category', pretty_print=pretty_print) + showIndent(outfile, level, pretty_print) + outfile.write('%s' % (namespaceprefix_, name_, eol_)) + else: + outfile.write('/>%s' % (eol_, )) + def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='Category'): + if self.uri is not None and 'uri' not in already_processed: + already_processed.add('uri') + outfile.write(' uri=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.uri), input_name='uri')), )) + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='Category', fromsubclass_=False, pretty_print=True): + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.name is not None: + showIndent(outfile, level, pretty_print) + outfile.write('<%sname>%s%s' % (namespaceprefix_ , self.gds_encode(self.gds_format_string(quote_xml(self.name), input_name='name')), namespaceprefix_ , eol_)) + if self.links is not None: + self.links.export(outfile, level, namespaceprefix_, namespacedef_='', name_='links', pretty_print=pretty_print) + def build(self, node): + already_processed = set() + self.buildAttributes(node, node.attrib, already_processed) + for child in node: + nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] + self.buildChildren(child, node, nodeName_) + return self + def buildAttributes(self, node, attrs, already_processed): + value = find_attr_value_('uri', node) + if value is not None and 'uri' not in already_processed: + already_processed.add('uri') + self.uri = value + def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): + if nodeName_ == 'name': + value_ = child_.text + value_ = self.gds_parse_string(value_, node, 'name') + value_ = self.gds_validate_string(value_, node, 'name') + self.name = value_ + elif nodeName_ == 'links': + obj_ = LinkList.factory(parent_object_=self) + obj_.build(child_) + self.links = obj_ + obj_.original_tagname_ = 'links' +# end class Category + + +class CategoryList(GeneratedsSuper): + subclass = None + superclass = None + def __init__(self, category=None, **kwargs_): + self.original_tagname_ = None + self.parent_object_ = kwargs_.get('parent_object_') + if category is None: + self.category = [] + else: + self.category = category + def factory(*args_, **kwargs_): + if CurrentSubclassModule_ is not None: + subclass = getSubclassFromModule_( + CurrentSubclassModule_, CategoryList) + if subclass is not None: + return subclass(*args_, **kwargs_) + if CategoryList.subclass: + return CategoryList.subclass(*args_, **kwargs_) + else: + return CategoryList(*args_, **kwargs_) + factory = staticmethod(factory) + def get_category(self): + return self.category + def set_category(self, category): + self.category = category + def add_category(self, value): + self.category.append(value) + def insert_category_at(self, index, value): + self.category.insert(index, value) + def replace_category_at(self, index, value): + self.category[index] = value + categoryProp = property(get_category, set_category) + def hasContent_(self): + if ( + self.category + ): + return True + else: + return False + def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CategoryList', pretty_print=True): + imported_ns_def_ = GenerateDSNamespaceDefs_.get('CategoryList') + if imported_ns_def_ is not None: + namespacedef_ = imported_ns_def_ + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.original_tagname_ is not None: + name_ = self.original_tagname_ + showIndent(outfile, level, pretty_print) + outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) + already_processed = set() + self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CategoryList') + if self.hasContent_(): + outfile.write('>%s' % (eol_, )) + self.exportChildren(outfile, level + 1, '', namespacedef_, name_='CategoryList', pretty_print=pretty_print) + showIndent(outfile, level, pretty_print) + outfile.write('%s' % (namespaceprefix_, name_, eol_)) + else: + outfile.write('/>%s' % (eol_, )) + def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CategoryList'): + pass + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CategoryList', fromsubclass_=False, pretty_print=True): + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + for category_ in self.category: + category_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='category', pretty_print=pretty_print) + def build(self, node): + already_processed = set() + self.buildAttributes(node, node.attrib, already_processed) + for child in node: + nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] + self.buildChildren(child, node, nodeName_) + return self + def buildAttributes(self, node, attrs, already_processed): + pass + def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): + if nodeName_ == 'category': + obj_ = Category.factory(parent_object_=self) + obj_.build(child_) + self.category.append(obj_) + obj_.original_tagname_ = 'category' +# end class CategoryList + + class PartReference(GeneratedsSuper): subclass = None superclass = None @@ -2602,6 +2822,7 @@ class Assembly(GeneratedsSuper): GDSClassesMapping = { + 'categories': CategoryList, 'part': Part, 'part-db': PartDb, } @@ -2737,6 +2958,8 @@ __all__ = [ "Assembly", "AssemblyPart", "AssemblyPartList", + "Category", + "CategoryList", "Fact", "FactList", "Link", diff --git a/xsd/ee.xsd b/xsd/ee.xsd index 87f072d..8aad7d2 100644 --- a/xsd/ee.xsd +++ b/xsd/ee.xsd @@ -35,11 +35,13 @@ TODO: rename 'id' to 'url'. + + @@ -47,6 +49,7 @@ TODO: rename 'id' to 'url'. + @@ -61,6 +64,20 @@ TODO: rename 'id' to 'url'. + + + + + + + + + + + + + + -- cgit v1.2.3