X-Git-Url: http://www.git.cypherpunks.ru/?p=pyderasn.git;a=blobdiff_plain;f=pyderasn.py;h=7bb1799863c454bb22bdbdc70c75411c47635f04;hp=24204ce8780ca8813723e4d09d715186fe1d1201;hb=749705c0df79a03dfda53dc0f78044efba8590a6;hpb=e1249a0c754920c57e6d7682458f50fd65b70026 diff --git a/pyderasn.py b/pyderasn.py index 24204ce..7bb1799 100755 --- a/pyderasn.py +++ b/pyderasn.py @@ -232,6 +232,7 @@ Currently available context options: * :ref:`allow_unordered_set ` * :ref:`bered ` * :ref:`defines_by_path ` +* :ref:`evgen_mode_upto ` .. _pprinting: @@ -428,7 +429,7 @@ ______________________________ Sometimes you either can not or do not want to explicitly set *defines* in the schema. You can dynamically apply those definitions when calling -``.decode()`` method. +:py:meth:`pyderasn.Obj.decode` method. Specify ``defines_by_path`` key in the :ref:`decode context `. Its value must be sequence of following tuples:: @@ -492,9 +493,9 @@ useful for SEQUENCE/SET OF-s. BER encoding ------------ -By default PyDERASN accepts only DER encoded data. It always encodes to -DER. But you can optionally enable BER decoding with setting ``bered`` -:ref:`context ` argument to True. Indefinite lengths and +By default PyDERASN accepts only DER encoded data. By default it encodes +to DER. But you can optionally enable BER decoding with setting +``bered`` :ref:`context ` argument to True. Indefinite lengths and constructed primitive types should be parsed successfully. * If object is encoded in BER form (not the DER one), then ``ber_encoded`` @@ -533,6 +534,314 @@ lengths will be invalid in that case. This option should be used only for skipping some decode errors, just to see the decoded structure somehow. +.. _streaming: + +Streaming and dealing with huge structures +------------------------------------------ + +.. _evgen_mode: + +evgen mode +__________ + +ASN.1 structures can be huge, they can hold millions of objects inside +(for example Certificate Revocation Lists (CRL), holding revocation +state for every previously issued X.509 certificate). CACert.org's 8 MiB +CRL file takes more than half a gigabyte of memory to hold the decoded +structure. + +If you just simply want to check the signature over the ``tbsCertList``, +you can create specialized schema with that field represented as +OctetString for example:: + + class TBSCertListFast(Sequence): + schema = ( + [...] + ("revokedCertificates", OctetString( + impl=SequenceOf.tag_default, + optional=True, + )), + [...] + ) + +This allows you to quickly decode a few fields and check the signature +over the ``tbsCertList`` bytes. + +But how can you get all certificate's serial number from it, after you +trust that CRL after signature validation? You can use so called +``evgen`` (event generation) mode, to catch the events/facts of some +successful object decoding. Let's use command line capabilities:: + + $ python -m pyderasn --schema tests.test_crl:CertificateList --evgen revoke.crl + 10 [1,1, 1] . . version: Version INTEGER v2 (01) OPTIONAL + 15 [1,1, 9] . . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13 + 26 [0,0, 2] . . . parameters: [UNIV 5] ANY OPTIONAL + 13 [1,1, 13] . . signature: AlgorithmIdentifier SEQUENCE + 34 [1,1, 3] . . . . . . type: AttributeType OBJECT IDENTIFIER 2.5.4.10 + 39 [0,0, 9] . . . . . . value: [UNIV 19] AttributeValue ANY + 32 [1,1, 14] . . . . . 0: AttributeTypeAndValue SEQUENCE + 30 [1,1, 16] . . . . 0: RelativeDistinguishedName SET OF + [...] + 188 [1,1, 1] . . . . userCertificate: CertificateSerialNumber INTEGER 17 (11) + 191 [1,1, 13] . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08 + 191 [0,0, 15] . . . . revocationDate: Time CHOICE utcTime + 191 [1,1, 13] . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08 + 186 [1,1, 18] . . . 0: RevokedCertificate SEQUENCE + 208 [1,1, 1] . . . . userCertificate: CertificateSerialNumber INTEGER 20 (14) + 211 [1,1, 13] . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01 + 211 [0,0, 15] . . . . revocationDate: Time CHOICE utcTime + 211 [1,1, 13] . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01 + 206 [1,1, 18] . . . 1: RevokedCertificate SEQUENCE + [...] + 9144992 [0,0, 15] . . . . revocationDate: Time CHOICE utcTime + 9144992 [1,1, 13] . . . . . utcTime: UTCTime UTCTime 2020-02-08T07:25:06 + 9144985 [1,1, 20] . . . 415755: RevokedCertificate SEQUENCE + 181 [1,4,9144821] . . revokedCertificates: RevokedCertificates SEQUENCE OF OPTIONAL + 5 [1,4,9144997] . tbsCertList: TBSCertList SEQUENCE + 9145009 [1,1, 9] . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13 + 9145020 [0,0, 2] . . parameters: [UNIV 5] ANY OPTIONAL + 9145007 [1,1, 13] . signatureAlgorithm: AlgorithmIdentifier SEQUENCE + 9145022 [1,3, 513] . signatureValue: BIT STRING 4096 bits + 0 [1,4,9145534] CertificateList SEQUENCE + +Here we see how decoder works: it decodes SEQUENCE's tag, length, then +decodes underlying values. It can not tell if SEQUENCE is decoded, so +the event of the upper level SEQUENCE is the last one we see. +``version`` field is just a single INTEGER -- it is decoded and event is +fired immediately. Then we see that ``algorithm`` and ``parameters`` +fields are decoded and only after them the ``signature`` SEQUENCE is +fired as a successfully decoded. There are 4 events for each revoked +certificate entry in that CRL: ``userCertificate`` serial number, +``utcTime`` of ``revocationDate`` CHOICE, ``RevokedCertificate`` itself +as a one of entity in ``revokedCertificates`` SEQUENCE OF. + +We can do that in our ordinary Python code and understand where we are +by looking at deterministically generated decode paths (do not forget +about useful ``--print-decode-path`` CLI option). We must use +:py:meth:`pyderasn.Obj.decode_evgen` method, instead of ordinary +:py:meth:`pyderasn.Obj.decode`. It is generator yielding ``(decode_path, +obj, tail)`` tuples:: + + for decode_path, obj, _ in CertificateList().decode_evgen(crl_raw): + if ( + len(decode_path) == 4 and + decode_path[:2] == ("tbsCertList", "revokedCertificates"), + decode_path[3] == "userCertificate" + ): + print("serial number:", int(obj)) + +Virtually it does not take any memory except at least needed for single +object storage. You can easily use that mode to determine required +object ``.offset`` and ``.*len`` to be able to decode it separately, or +maybe verify signature upon it just by taking bytes by ``.offset`` and +``.tlvlen``. + +.. _evgen_mode_upto_ctx: + +evgen_mode_upto +_______________ + +There is full ability to get any kind of data from the CRL in the +example above. However it is not too convenient to get the whole +``RevokedCertificate`` structure, that is pretty lightweight and one may +do not want to disassemble it. You can use ``evgen_mode_upto`` +:ref:`ctx ` option that semantically equals to +:ref:`defines_by_path ` -- list of decode paths +mapped to any non-None value. If specified decode path is met, then any +subsequent objects won't be decoded in evgen mode. That allows us to +parse the CRL above with fully assembled ``RevokedCertificate``:: + + for decode_path, obj, _ in CertificateList().decode_evgen( + crl_raw, + ctx={"evgen_mode_upto": ( + (("tbsCertList", "revokedCertificates", any), True), + )}, + ): + if ( + len(decode_path) == 3 and + decode_path[:2] == ("tbsCertList", "revokedCertificates"), + ): + print("serial number:", int(obj["userCertificate"])) + +.. _mmap: + +mmap-ed file +____________ + +POSIX compliant systems have ``mmap`` syscall, giving ability to work +the memory mapped file. You can deal with the file like it was an +ordinary binary string, allowing you not to load it to the memory first. +Also you can use them as an input for OCTET STRING, taking no Python +memory for their storage. + +There is convenient :py:func:`pyderasn.file_mmaped` function that +creates read-only memoryview on the file contents:: + + with open("huge", "rb") as fd: + raw = file_mmaped(fd) + obj = Something.decode(raw) + +.. warning:: + + mmap-ed files in Python2.7 does not implement buffer protocol, so + memoryview won't work on them. + +.. warning:: + + mmap maps the **whole** file. So it plays no role if you seek-ed it + before. Take the slice of the resulting memoryview with required + offset instead. + +.. note:: + + If you use ZFS as underlying storage, then pay attention that + currently most platforms does not deal good with ZFS ARC and ordinary + page cache used for mmaps. It can take twice the necessary size in + the memory: both in page cache and ZFS ARC. + +CER encoding +____________ + +We can parse any kind of data now, but how can we produce files +streamingly, without storing their encoded representation in memory? +SEQUENCE by default encodes in memory all its values, joins them in huge +binary string, just to know the exact size of SEQUENCE's value for +encoding it in TLV. DER requires you to know all exact sizes of the +objects. + +You can use CER encoding mode, that slightly differs from the DER, but +does not require exact sizes knowledge, allowing streaming encoding +directly to some writer/buffer. Just use +:py:meth:`pyderasn.Obj.encode_cer` method, providing the writer where +encoded data will flow:: + + opener = io.open if PY2 else open + with opener("result", "wb") as fd: + obj.encode_cer(fd.write) + +:: + + buf = io.BytesIO() + obj.encode_cer(buf.write) + +If you do not want to create in-memory buffer every time, then you can +use :py:func:`pyderasn.encode_cer` function:: + + data = encode_cer(obj) + +Remember that CER is **not valid** DER in most cases, so you **have to** +use :ref:`bered ` :ref:`ctx ` option during its +decoding. Also currently there is **no** validation that provided CER is +valid one -- you are sure that it has only valid BER encoding. + +.. warning:: + + SET OF values can not be streamingly encoded, because they are + required to be sorted byte-by-byte. Big SET OF values still will take + much memory. Use neither SET nor SET OF values, as modern ASN.1 + also recommends too. + +Do not forget about using :ref:`mmap-ed ` memoryviews for your +OCTET STRINGs! They will be streamingly copied from underlying file to +the buffer using 1 KB chunks. + +Some structures require that some of the elements have to be forcefully +DER encoded. For example ``SignedData`` CMS requires you to encode +``SignedAttributes`` and X.509 certificates in DER form, allowing you to +encode everything else in BER. You can tell any of the structures to be +forcefully encoded in DER during CER encoding, by specifying +``der_forced=True`` attribute:: + + class Certificate(Sequence): + schema = (...) + der_forced = True + + class SignedAttributes(SetOf): + schema = Attribute() + bounds = (1, 32) + der_forced = True + +agg_octet_string +________________ + +In most cases, huge quantity of binary data is stored as OCTET STRING. +CER encoding splits it on 1 KB chunks. BER allows splitting on various +levels of chunks inclusion:: + + SOME STRING[CONSTRUCTED] + OCTET STRING[CONSTRUCTED] + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[CONSTRUCTED] + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[PRIMITIVE] + DATA CHUNK + OCTET STRING[CONSTRUCTED] + OCTET STRING[CONSTRUCTED] + OCTET STRING[PRIMITIVE] + DATA CHUNK + +You can not just take the offset and some ``.vlen`` of the STRING and +treat it as the payload. If you decode it without +:ref:`evgen mode `, then it will be automatically aggregated +and ``bytes()`` will give the whole payload contents. + +You are forced to use :ref:`evgen mode ` for decoding for +small memory footprint. There is convenient +:py:func:`pyderasn.agg_octet_string` helper for reconstructing the +payload. Let's assume you have got BER/CER encoded ``ContentInfo`` with +huge ``SignedData`` and ``EncapsulatedContentInfo``. Let's calculate the +SHA512 digest of its ``eContent``:: + + fd = open("data.p7m", "rb") + raw = file_mmaped(fd) + ctx = {"bered": True} + for decode_path, obj, _ in ContentInfo().decode_evgen(raw, ctx=ctx): + if decode_path == ("content",): + content = obj + break + else: + raise ValueError("no content found") + hasher_state = sha512() + def hasher(data): + hasher_state.update(data) + return len(data) + evgens = SignedData().decode_evgen( + raw[content.offset:], + offset=content.offset, + ctx=ctx, + ) + agg_octet_string(evgens, ("encapContentInfo", "eContent"), raw, hasher) + fd.close() + digest = hasher_state.digest() + +Simply replace ``hasher`` with some writeable file's ``fd.write`` to +copy the payload (without BER/CER encoding interleaved overhead) in it. +Virtually it won't take memory more than for keeping small structures +and 1 KB binary chunks. + +SEQUENCE OF iterators +_____________________ + +You can use iterators as a value in :py:class:`pyderasn.SequenceOf` +classes. The only difference with providing the full list of objects, is +that type and bounds checking is done during encoding process. Also +sequence's value will be emptied after encoding, forcing you to set its +value again. + +This is very useful when you have to create some huge objects, like +CRLs, with thousands and millions of entities inside. You can write the +generator taking necessary data from the database and giving the +``RevokedCertificate`` objects. Only binary representation of that +objects will take memory during DER encoding. + Base Obj -------- .. autoclass:: pyderasn.Obj @@ -642,7 +951,10 @@ Various ------- .. autofunction:: pyderasn.abs_decode_path +.. autofunction:: pyderasn.agg_octet_string .. autofunction:: pyderasn.colonize_hex +.. autofunction:: pyderasn.encode_cer +.. autofunction:: pyderasn.file_mmaped .. autofunction:: pyderasn.hexenc .. autofunction:: pyderasn.hexdec .. autofunction:: pyderasn.tag_encode @@ -772,6 +1084,7 @@ Now you can print only the specified tree, for example signature algorithm:: . . 05:00 """ +from array import array from codecs import getdecoder from codecs import getencoder from collections import namedtuple @@ -779,7 +1092,10 @@ from collections import OrderedDict from copy import copy from datetime import datetime from datetime import timedelta +from io import BytesIO from math import ceil +from mmap import mmap +from mmap import PROT_READ from operator import attrgetter from string import ascii_letters from string import digits @@ -811,6 +1127,7 @@ except ImportError: # pragma: no cover __version__ = "7.0" __all__ = ( + "agg_octet_string", "Any", "BitString", "BMPString", @@ -819,8 +1136,10 @@ __all__ = ( "Choice", "DecodeError", "DecodePathDefBy", + "encode_cer", "Enumerated", "ExceedingData", + "file_mmaped", "GeneralizedTime", "GeneralString", "GraphicString", @@ -886,7 +1205,16 @@ NAMEDTUPLE_KWARGS = {} if version_info < (3, 6) else {"module": __name__} SET01 = frozenset("01") DECIMALS = frozenset(digits) DECIMAL_SIGNS = ".," +NEXT_ATTR_NAME = "next" if PY2 else "__next__" + + +def file_mmaped(fd): + """Make mmap-ed memoryview for reading from file + :param fd: file object + :returns: memoryview over read-only mmap-ing of the whole file + """ + return memoryview(mmap(fd.fileno(), 0, prot=PROT_READ)) def pureint(value): if not set(value) <= DECIMALS: @@ -1190,6 +1518,27 @@ def len_decode(data): return l, 1 + octets_num, data[1 + octets_num:] +LEN1K = len_encode(1000) + + +def write_full(writer, data): + """Fully write provided data + + :param writer: must comply with ``io.RawIOBase.write`` behaviour + + BytesIO does not guarantee that the whole data will be written at + once. That function write everything provided, raising an error if + ``writer`` returns None. + """ + data = memoryview(data) + written = 0 + while written != len(data): + n = writer(data[written:]) + if n is None: + raise ValueError("can not write to buf") + written += n + + ######################################################################## # Base class ######################################################################## @@ -1314,6 +1663,10 @@ class Obj(object): """ return self._tag_order + @property + def tag_order_cer(self): + return self.tag_order + @property def tlen(self): """See :ref:`decoding` @@ -1348,7 +1701,7 @@ class Obj(object): yield NotImplemented def encode(self): - """Encode the structure + """DER encode the structure :returns: DER representation """ @@ -1357,6 +1710,26 @@ class Obj(object): return raw return b"".join((self._expl, len_encode(len(raw)), raw)) + def encode_cer(self, writer): + """CER encode the structure to specified writer + + :param writer: must comply with ``io.RawIOBase.write`` + behaviour. It takes slice to be written and + returns number of bytes processed. If it returns + None, then exception will be raised + """ + if self._expl is not None: + write_full(writer, self._expl + LENINDEF) + if getattr(self, "der_forced", False): + write_full(writer, self._encode()) + else: + self._encode_cer(writer) + if self._expl is not None: + write_full(writer, EOC) + + def _encode_cer(self, writer): + write_full(writer, self._encode()) + def hexencode(self): """Do hexadecimal encoded :py:meth:`pyderasn.Obj.encode` """ @@ -1372,6 +1745,26 @@ class Obj(object): tag_only=False, _ctx_immutable=True, ): + """Decode the data + + :param data: either binary or memoryview + :param int offset: initial data's offset + :param bool leavemm: do we need to leave memoryview of remaining + data as is, or convert it to bytes otherwise + :param decode_path: current decode path (tuples of strings, + possibly with DecodePathDefBy) with will be + the root for all underlying objects + :param ctx: optional :ref:`context ` governing decoding process + :param bool tag_only: decode only the tag, without length and + contents (used only in Choice and Set + structures, trying to determine if tag satisfies + the schema) + :param bool _ctx_immutable: do we need to ``copy.copy()`` ``ctx`` + before using it? + :returns: (Obj, remaining data) + + .. seealso:: :ref:`decoding` + """ result = next(self.decode_evgen( data, offset, @@ -1398,21 +1791,11 @@ class Obj(object): _ctx_immutable=True, _evgen_mode=True, ): - """Decode the data - - :param data: either binary or memoryview - :param int offset: initial data's offset - :param bool leavemm: do we need to leave memoryview of remaining - data as is, or convert it to bytes otherwise - :param ctx: optional :ref:`context ` governing decoding process - :param tag_only: decode only the tag, without length and contents - (used only in Choice and Set structures, trying to - determine if tag satisfies the schema) - :param _ctx_immutable: do we need to ``copy.copy()`` ``ctx`` - before using it? - :returns: (Obj, remaining data) + """Decode with evgen mode on - .. seealso:: :ref:`decoding` + That method is identical to :py:meth:`pyderasn.Obj.decode`, but + it returns the generator producing ``(decode_path, obj, tail)`` + values. See :ref:`evgen mode `. """ if ctx is None: ctx = {} @@ -1648,6 +2031,16 @@ class Obj(object): ) +def encode_cer(obj): + """Encode to CER in memory buffer + + :returns bytes: memory buffer contents + """ + buf = BytesIO() + obj.encode_cer(buf.write) + return buf.getvalue() + + class DecodePathDefBy(object): """DEFINED BY representation inside decode path """ @@ -1886,6 +2279,7 @@ def pprint( with_colours=False, with_decode_path=False, decode_path_only=(), + decode_path=(), ): """Pretty print object @@ -1938,7 +2332,7 @@ def pprint( else: for row in _pprint_pps(pp): yield row - return "\n".join(_pprint_pps(obj.pps())) + return "\n".join(_pprint_pps(obj.pps(decode_path))) ######################################################################## @@ -2784,6 +3178,30 @@ class BitString(Obj): octets, )) + def _encode_cer(self, writer): + bit_len, octets = self._value + if len(octets) + 1 <= 1000: + write_full(writer, self._encode()) + return + write_full(writer, self.tag_constructed) + write_full(writer, LENINDEF) + for offset in six_xrange(0, (len(octets) // 999) * 999, 999): + write_full(writer, b"".join(( + BitString.tag_default, + LEN1K, + int2byte(0), + octets[offset:offset + 999], + ))) + tail = octets[offset+999:] + if len(tail) > 0: + tail = int2byte((8 - bit_len % 8) % 8) + tail + write_full(writer, b"".join(( + BitString.tag_default, + len_encode(len(tail)), + tail, + ))) + write_full(writer, EOC) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): try: t, tlen, lv = tag_strip(tlv) @@ -3063,13 +3481,10 @@ class OctetString(Obj): >>> OctetString(b"hell", bounds=(4, 4)) OCTET STRING 4 bytes 68656c6c - .. note:: - - Pay attention that OCTET STRING can be encoded both in primitive - and constructed forms. Decoder always checks constructed form tag - additionally to specified primitive one. If BER decoding is - :ref:`not enabled `, then decoder will fail, because - of DER restrictions. + Memoryviews can be used as a values. If memoryview is made on + mmap-ed file, then it does not take storage inside OctetString + itself. In CER encoding mode it will be streamed to the specified + writer, copying 1 KB chunks. """ __slots__ = ("tag_constructed", "_bound_min", "_bound_max", "defined") tag_default = tag_encode(4) @@ -3124,12 +3539,12 @@ class OctetString(Obj): ) def _value_sanitize(self, value): - if value.__class__ == binary_type: + if value.__class__ == binary_type or value.__class__ == memoryview: pass elif issubclass(value.__class__, OctetString): value = value._value else: - raise InvalidValueType((self.__class__, bytes)) + raise InvalidValueType((self.__class__, bytes, memoryview)) if not self._bound_min <= len(value) <= self._bound_max: raise BoundsError(self._bound_min, len(value), self._bound_max) return value @@ -3169,7 +3584,7 @@ class OctetString(Obj): def __bytes__(self): self._assert_ready() - return self._value + return bytes(self._value) def __eq__(self, their): if their.__class__ == binary_type: @@ -3214,6 +3629,28 @@ class OctetString(Obj): self._value, )) + def _encode_cer(self, writer): + octets = self._value + if len(octets) <= 1000: + write_full(writer, self._encode()) + return + write_full(writer, self.tag_constructed) + write_full(writer, LENINDEF) + for offset in six_xrange(0, (len(octets) // 1000) * 1000, 1000): + write_full(writer, b"".join(( + OctetString.tag_default, + LEN1K, + octets[offset:offset + 1000], + ))) + tail = octets[offset+1000:] + if len(tail) > 0: + write_full(writer, b"".join(( + OctetString.tag_default, + len_encode(len(tail)), + tail, + ))) + write_full(writer, EOC) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): try: t, tlen, lv = tag_strip(tlv) @@ -3450,6 +3887,31 @@ class OctetString(Obj): yield pp +def agg_octet_string(evgens, decode_path, raw, writer): + """Aggregate constructed string (OctetString and its derivatives) + + :param evgens: iterator of generated events + :param decode_path: points to the string we want to decode + :param raw: slicebable (memoryview, bytearray, etc) with + the data evgens are generated on + :param writer: buffer.write where string is going to be saved + :param writer: where string is going to be saved. Must comply + with ``io.RawIOBase.write`` behaviour + """ + decode_path_len = len(decode_path) + for dp, obj, _ in evgens: + if dp[:decode_path_len] != decode_path: + continue + if not obj.ber_encoded: + write_full(writer, raw[ + obj.offset + obj.tlen + obj.llen: + obj.offset + obj.tlen + obj.llen + obj.vlen - + (EOC_LEN if obj.expl_lenindef else 0) + ]) + if len(dp) == decode_path_len: + break + + NullState = namedtuple("NullState", BasicState._fields, **NAMEDTUPLE_KWARGS) @@ -3669,7 +4131,7 @@ class ObjectIdentifier(Obj): def __add__(self, their): if their.__class__ == tuple: - return self.__class__(self._value + their) + return self.__class__(self._value + array("L", their)) if isinstance(their, self.__class__): return self.__class__(self._value + their._value) raise InvalidValueType((self.__class__, tuple)) @@ -3679,10 +4141,15 @@ class ObjectIdentifier(Obj): return value._value if isinstance(value, string_types): try: - value = tuple(pureint(arc) for arc in value.split(".")) + value = array("L", (pureint(arc) for arc in value.split("."))) except ValueError: raise InvalidOID("unacceptable arcs values") if value.__class__ == tuple: + try: + value = array("L", value) + except OverflowError as err: + raise InvalidOID(repr(err)) + if value.__class__ is array: if len(value) < 2: raise InvalidOID("less than 2 arcs") first_arc = value[0] @@ -3742,7 +4209,7 @@ class ObjectIdentifier(Obj): def __eq__(self, their): if their.__class__ == tuple: - return self._value == their + return self._value == array("L", their) if not issubclass(their.__class__, ObjectIdentifier): return False return ( @@ -3834,7 +4301,7 @@ class ObjectIdentifier(Obj): offset=offset, ) v, tail = v[:l], v[l:] - arcs = [] + arcs = array("L") ber_encoded = False while len(v) > 0: i = 0 @@ -3845,10 +4312,23 @@ class ObjectIdentifier(Obj): if ctx.get("bered", False): ber_encoded = True else: - raise DecodeError("non normalized arc encoding") + raise DecodeError( + "non normalized arc encoding", + klass=self.__class__, + decode_path=decode_path, + offset=offset, + ) arc = (arc << 7) | (octet & 0x7F) if octet & 0x80 == 0: - arcs.append(arc) + try: + arcs.append(arc) + except OverflowError: + raise DecodeError( + "too huge value for local unsigned long", + klass=self.__class__, + decode_path=decode_path, + offset=offset, + ) v = v[i + 1:] break i += 1 @@ -3870,7 +4350,7 @@ class ObjectIdentifier(Obj): first_arc = 2 second_arc -= 80 obj = self.__class__( - value=tuple([first_arc, second_arc] + arcs[1:]), + value=array("L", (first_arc, second_arc)) + arcs[1:], impl=self.tag, expl=self._expl, default=self.default, @@ -4534,6 +5014,9 @@ class UTCTime(VisibleString): value = self._encode_time() return b"".join((self.tag, len_encode(len(value)), value)) + def _encode_cer(self, writer): + write_full(writer, self._encode()) + def todatetime(self): return self._value @@ -4905,6 +5388,10 @@ class Choice(Obj): self._assert_ready() return self._value[1].tag_order if self._tag_order is None else self._tag_order + @property + def tag_order_cer(self): + return min(v.tag_order_cer for v in itervalues(self.specs)) + def __getitem__(self, key): if key not in self.specs: raise ObjUnknown(key) @@ -4935,6 +5422,10 @@ class Choice(Obj): self._assert_ready() return self._value[1].encode() + def _encode_cer(self, writer): + self._assert_ready() + self._value[1].encode_cer(writer) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): for choice, spec in iteritems(self.specs): sub_decode_path = decode_path + (choice,) @@ -5066,7 +5557,7 @@ class Any(Obj): """``ANY`` special type >>> Any(Integer(-123)) - ANY 020185 + ANY INTEGER -123 (0X:7B) >>> a = Any(OctetString(b"hello world").encode()) ANY 040b68656c6c6f20776f726c64 >>> hexenc(bytes(a)) @@ -5114,9 +5605,9 @@ class Any(Obj): return value if isinstance(value, self.__class__): return value._value - if isinstance(value, Obj): - return value.encode() - raise InvalidValueType((self.__class__, Obj, binary_type)) + if not isinstance(value, Obj): + raise InvalidValueType((self.__class__, Obj, binary_type)) + return value @property def ready(self): @@ -5160,9 +5651,13 @@ class Any(Obj): def __eq__(self, their): if their.__class__ == binary_type: - return self._value == their + if self._value.__class__ == binary_type: + return self._value == their + return self._value.encode() == their if issubclass(their.__class__, Any): - return self._value == their._value + if self.ready and their.ready: + return bytes(self) == bytes(their) + return self.ready == their.ready return False def __call__( @@ -5179,7 +5674,10 @@ class Any(Obj): def __bytes__(self): self._assert_ready() - return self._value + value = self._value + if value.__class__ == binary_type: + return value + return self._value.encode() @property def tlen(self): @@ -5187,7 +5685,18 @@ class Any(Obj): def _encode(self): self._assert_ready() - return self._value + value = self._value + if value.__class__ == binary_type: + return value + return value.encode() + + def _encode_cer(self, writer): + self._assert_ready() + value = self._value + if value.__class__ == binary_type: + write_full(writer, value) + else: + value.encode_cer(writer) def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): try: @@ -5264,12 +5773,20 @@ class Any(Obj): return pp_console_row(next(self.pps())) def pps(self, decode_path=()): + value = self._value + if value is None: + pass + elif value.__class__ == binary_type: + value = None + else: + value = repr(value) yield _pp( obj=self, asn1_type_name=self.asn1_type_name, obj_name=self.__class__.__name__, decode_path=decode_path, - blob=self._value if self.ready else None, + value=value, + blob=self._value if self._value.__class__ == binary_type else None, optional=self.optional, default=self == self.default, impl=None if self.tag == self.tag_default else tag_decode(self.tag), @@ -5591,6 +6108,12 @@ class Sequence(Obj): v = b"".join(v.encode() for v in self._values_for_encoding()) return b"".join((self.tag, len_encode(len(v)), v)) + def _encode_cer(self, writer): + write_full(writer, self.tag + LENINDEF) + for v in self._values_for_encoding(): + v.encode_cer(writer) + write_full(writer, EOC) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): try: t, tlen, lv = tag_strip(tlv) @@ -5853,6 +6376,15 @@ class Set(Sequence): )) return b"".join((self.tag, len_encode(len(v)), v)) + def _encode_cer(self, writer): + write_full(writer, self.tag + LENINDEF) + for v in sorted( + self._values_for_encoding(), + key=attrgetter("tag_order_cer"), + ): + v.encode_cer(writer) + write_full(writer, EOC) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): try: t, tlen, lv = tag_strip(tlv) @@ -6049,9 +6581,21 @@ class SequenceOf(Obj): >>> ints Ints SEQUENCE OF[INTEGER 123, INTEGER 345] - Also you can initialize sequence with preinitialized values: + You can initialize sequence with preinitialized values: >>> ints = Ints([Integer(123), Integer(234)]) + + Also you can use iterator as a value: + + >>> ints = Ints(iter(Integer(i) for i in range(1000000))) + + And it won't be iterated until encoding process. Pay attention that + bounds and required schema checks are done only during the encoding + process in that case! After encode was called, then value is zeroed + back to empty list and you have to set it again. That mode is useful + mainly with CER encoding mode, where all objects from the iterable + will be streamed to the buffer, without copying all of them to + memory first. """ __slots__ = ("spec", "_bound_min", "_bound_max") tag_default = tag_encode(form=TagFormConstructed, num=16) @@ -6095,21 +6639,31 @@ class SequenceOf(Obj): self._value = copy(default_obj._value) def _value_sanitize(self, value): + iterator = False if issubclass(value.__class__, SequenceOf): value = value._value + elif hasattr(value, NEXT_ATTR_NAME): + iterator = True + value = value elif hasattr(value, "__iter__"): value = list(value) else: - raise InvalidValueType((self.__class__, iter)) - if not self._bound_min <= len(value) <= self._bound_max: - raise BoundsError(self._bound_min, len(value), self._bound_max) - for v in value: - if not isinstance(v, self.spec.__class__): - raise InvalidValueType((self.spec.__class__,)) + raise InvalidValueType((self.__class__, iter, "iterator")) + if not iterator: + if not self._bound_min <= len(value) <= self._bound_max: + raise BoundsError(self._bound_min, len(value), self._bound_max) + class_expected = self.spec.__class__ + for v in value: + if not isinstance(v, class_expected): + raise InvalidValueType((class_expected,)) return value @property def ready(self): + if hasattr(self._value, NEXT_ATTR_NAME): + return True + if self._bound_min > 0 and len(self._value) == 0: + return False return all(v.ready for v in self._value) @property @@ -6119,6 +6673,8 @@ class SequenceOf(Obj): return any(v.bered for v in self._value) def __getstate__(self): + if hasattr(self._value, NEXT_ATTR_NAME): + raise ValueError("can not pickle SequenceOf with iterator") return SequenceOfState( __version__, self.tag, @@ -6194,11 +6750,9 @@ class SequenceOf(Obj): self._value.append(value) def __iter__(self): - self._assert_ready() return iter(self._value) def __len__(self): - self._assert_ready() return len(self._value) def __setitem__(self, key, value): @@ -6213,8 +6767,43 @@ class SequenceOf(Obj): return iter(self._value) def _encode(self): - v = b"".join(v.encode() for v in self._values_for_encoding()) - return b"".join((self.tag, len_encode(len(v)), v)) + iterator = hasattr(self._value, NEXT_ATTR_NAME) + if iterator: + values = [] + values_append = values.append + class_expected = self.spec.__class__ + values_for_encoding = self._values_for_encoding() + self._value = [] + for v in values_for_encoding: + if not isinstance(v, class_expected): + raise InvalidValueType((class_expected,)) + values_append(v.encode()) + if not self._bound_min <= len(values) <= self._bound_max: + raise BoundsError(self._bound_min, len(values), self._bound_max) + value = b"".join(values) + else: + value = b"".join(v.encode() for v in self._values_for_encoding()) + return b"".join((self.tag, len_encode(len(value)), value)) + + def _encode_cer(self, writer): + write_full(writer, self.tag + LENINDEF) + iterator = hasattr(self._value, NEXT_ATTR_NAME) + if iterator: + class_expected = self.spec.__class__ + values_count = 0 + values_for_encoding = self._values_for_encoding() + self._value = [] + for v in values_for_encoding: + if not isinstance(v, class_expected): + raise InvalidValueType((class_expected,)) + v.encode_cer(writer) + values_count += 1 + if not self._bound_min <= values_count <= self._bound_max: + raise BoundsError(self._bound_min, values_count, self._bound_max) + else: + for v in self._values_for_encoding(): + v.encode_cer(writer) + write_full(writer, EOC) def _decode( self, @@ -6407,10 +6996,24 @@ class SetOf(SequenceOf): tag_default = tag_encode(form=TagFormConstructed, num=17) asn1_type_name = "SET OF" + def _value_sanitize(self, value): + value = super(SetOf, self)._value_sanitize(value) + if hasattr(value, NEXT_ATTR_NAME): + raise ValueError( + "SetOf does not support iterator values, as no sense in them" + ) + return value + def _encode(self): v = b"".join(sorted(v.encode() for v in self._values_for_encoding())) return b"".join((self.tag, len_encode(len(v)), v)) + def _encode_cer(self, writer): + write_full(writer, self.tag + LENINDEF) + for v in sorted(encode_cer(v) for v in self._values_for_encoding()): + write_full(writer, v) + write_full(writer, EOC) + def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode): return super(SetOf, self)._decode( tlv, @@ -6540,14 +7143,22 @@ def main(): # pragma: no cover help="Allow explicit tag out-of-bound", ) parser.add_argument( - "DERFile", + "--evgen", + action="store_true", + help="Turn on event generation mode", + ) + parser.add_argument( + "RAWFile", type=argparse.FileType("rb"), - help="Path to DER file you want to decode", + help="Path to BER/CER/DER file you want to decode", ) args = parser.parse_args() - args.DERFile.seek(args.skip) - der = memoryview(args.DERFile.read()) - args.DERFile.close() + if PY2: + args.RAWFile.seek(args.skip) + raw = memoryview(args.RAWFile.read()) + args.RAWFile.close() + else: + raw = file_mmaped(args.RAWFile)[args.skip:] oid_maps = ( [obj_by_path(_path) for _path in (args.oids or "").split(",")] if args.oids else () @@ -6564,10 +7175,9 @@ def main(): # pragma: no cover } if args.defines_by_path is not None: ctx["defines_by_path"] = obj_by_path(args.defines_by_path) - obj, tail = schema().decode(der, ctx=ctx) from os import environ - print(pprinter( - obj, + pprinter = partial( + pprinter, oid_maps=oid_maps, with_colours=environ.get("NO_COLOR") is None, with_decode_path=args.print_decode_path, @@ -6575,7 +7185,13 @@ def main(): # pragma: no cover () if args.decode_path_only is None else tuple(args.decode_path_only.split(":")) ), - )) + ) + if args.evgen: + for decode_path, obj, tail in schema().decode_evgen(raw, ctx=ctx): + print(pprinter(obj, decode_path=decode_path)) + else: + obj, tail = schema().decode(raw, ctx=ctx) + print(pprinter(obj)) if tail != b"": print("\nTrailing data: %s" % hexenc(tail))