X-Git-Url: http://www.git.cypherpunks.ru/?p=pyderasn.git;a=blobdiff_plain;f=pyderasn.py;h=7bb1799863c454bb22bdbdc70c75411c47635f04;hp=24204ce8780ca8813723e4d09d715186fe1d1201;hb=749705c0df79a03dfda53dc0f78044efba8590a6;hpb=e1249a0c754920c57e6d7682458f50fd65b70026

diff --git a/pyderasn.py b/pyderasn.py
index 24204ce..7bb1799 100755
--- a/pyderasn.py
+++ b/pyderasn.py
@@ -232,6 +232,7 @@ Currently available context options:
 * :ref:`allow_unordered_set <allow_unordered_set_ctx>`
 * :ref:`bered <bered_ctx>`
 * :ref:`defines_by_path <defines_by_path_ctx>`
+* :ref:`evgen_mode_upto <evgen_mode_upto_ctx>`
 
 .. _pprinting:
 
@@ -428,7 +429,7 @@ ______________________________
 
 Sometimes you either can not or do not want to explicitly set *defines*
 in the schema. You can dynamically apply those definitions when calling
-``.decode()`` method.
+:py:meth:`pyderasn.Obj.decode` method.
 
 Specify ``defines_by_path`` key in the :ref:`decode context <ctx>`. Its
 value must be sequence of following tuples::
@@ -492,9 +493,9 @@ useful for SEQUENCE/SET OF-s.
 BER encoding
 ------------
 
-By default PyDERASN accepts only DER encoded data. It always encodes to
-DER. But you can optionally enable BER decoding with setting ``bered``
-:ref:`context <ctx>` argument to True. Indefinite lengths and
+By default PyDERASN accepts only DER encoded data. By default it encodes
+to DER. But you can optionally enable BER decoding with setting
+``bered`` :ref:`context <ctx>` argument to True. Indefinite lengths and
 constructed primitive types should be parsed successfully.
 
 * If object is encoded in BER form (not the DER one), then ``ber_encoded``
@@ -533,6 +534,314 @@ lengths will be invalid in that case.
    This option should be used only for skipping some decode errors, just
    to see the decoded structure somehow.
 
+.. _streaming:
+
+Streaming and dealing with huge structures
+------------------------------------------
+
+.. _evgen_mode:
+
+evgen mode
+__________
+
+ASN.1 structures can be huge, they can hold millions of objects inside
+(for example Certificate Revocation Lists (CRL), holding revocation
+state for every previously issued X.509 certificate). CACert.org's 8 MiB
+CRL file takes more than half a gigabyte of memory to hold the decoded
+structure.
+
+If you just simply want to check the signature over the ``tbsCertList``,
+you can create specialized schema with that field represented as
+OctetString for example::
+
+    class TBSCertListFast(Sequence):
+        schema = (
+            [...]
+            ("revokedCertificates", OctetString(
+                impl=SequenceOf.tag_default,
+                optional=True,
+            )),
+            [...]
+        )
+
+This allows you to quickly decode a few fields and check the signature
+over the ``tbsCertList`` bytes.
+
+But how can you get all certificates' serial numbers from it, once you
+trust that CRL after signature validation? You can use the so-called
+``evgen`` (event generation) mode, to catch the events/facts of some
+successful object decoding. Let's use command line capabilities::
+
+    $ python -m pyderasn --schema tests.test_crl:CertificateList --evgen revoke.crl
+         10     [1,1,   1]   . . version: Version INTEGER v2 (01) OPTIONAL
+         15     [1,1,   9]   . . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13
+         26     [0,0,   2]   . . . parameters: [UNIV 5] ANY OPTIONAL
+         13     [1,1,  13]   . . signature: AlgorithmIdentifier SEQUENCE
+         34     [1,1,   3]   . . . . . . type: AttributeType OBJECT IDENTIFIER 2.5.4.10
+         39     [0,0,   9]   . . . . . . value: [UNIV 19] AttributeValue ANY
+         32     [1,1,  14]   . . . . . 0: AttributeTypeAndValue SEQUENCE
+         30     [1,1,  16]   . . . . 0: RelativeDistinguishedName SET OF
+    [...]
+        188     [1,1,   1]   . . . . userCertificate: CertificateSerialNumber INTEGER 17 (11)
+        191     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08
+        191     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+        191     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08
+        186     [1,1,  18]   . . . 0: RevokedCertificate SEQUENCE
+        208     [1,1,   1]   . . . . userCertificate: CertificateSerialNumber INTEGER 20 (14)
+        211     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01
+        211     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+        211     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01
+        206     [1,1,  18]   . . . 1: RevokedCertificate SEQUENCE
+    [...]
+    9144992     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+    9144992     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2020-02-08T07:25:06
+    9144985     [1,1,  20]   . . . 415755: RevokedCertificate SEQUENCE
+      181     [1,4,9144821]   . . revokedCertificates: RevokedCertificates SEQUENCE OF OPTIONAL
+        5     [1,4,9144997]   . tbsCertList: TBSCertList SEQUENCE
+    9145009     [1,1,   9]   . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13
+    9145020     [0,0,   2]   . . parameters: [UNIV 5] ANY OPTIONAL
+    9145007     [1,1,  13]   . signatureAlgorithm: AlgorithmIdentifier SEQUENCE
+    9145022     [1,3, 513]   . signatureValue: BIT STRING 4096 bits
+        0     [1,4,9145534]  CertificateList SEQUENCE
+
+Here we see how decoder works: it decodes SEQUENCE's tag, length, then
+decodes underlying values. It can not tell if SEQUENCE is decoded, so
+the event of the upper level SEQUENCE is the last one we see.
+``version`` field is just a single INTEGER -- it is decoded and event is
+fired immediately. Then we see that ``algorithm`` and ``parameters``
+fields are decoded and only after them the ``signature`` SEQUENCE is
+fired as successfully decoded. There are 4 events for each revoked
+certificate entry in that CRL: ``userCertificate`` serial number,
+``utcTime`` of ``revocationDate`` CHOICE, that CHOICE itself, and
+``RevokedCertificate`` as an entity of ``revokedCertificates`` SEQUENCE OF.
+
+We can do that in our ordinary Python code and understand where we are
+by looking at deterministically generated decode paths (do not forget
+about useful ``--print-decode-path`` CLI option). We must use
+:py:meth:`pyderasn.Obj.decode_evgen` method, instead of ordinary
+:py:meth:`pyderasn.Obj.decode`. It is generator yielding ``(decode_path,
+obj, tail)`` tuples::
+
+    for decode_path, obj, _ in CertificateList().decode_evgen(crl_raw):
+        if (
+            len(decode_path) == 4 and
+            decode_path[:2] == ("tbsCertList", "revokedCertificates") and
+            decode_path[3] == "userCertificate"
+        ):
+            print("serial number:", int(obj))
+
+Virtually it does not take any memory, except that needed for a single
+object storage. You can easily use that mode to determine required
+object ``.offset`` and ``.*len`` to be able to decode it separately, or
+maybe verify signature upon it just by taking bytes by ``.offset`` and
+``.tlvlen``.
+
+.. _evgen_mode_upto_ctx:
+
+evgen_mode_upto
+_______________
+
+There is full ability to get any kind of data from the CRL in the
+example above. However it is not too convenient to get the whole
+``RevokedCertificate`` structure, that is pretty lightweight and one may
+not want to disassemble it. You can use ``evgen_mode_upto``
+:ref:`ctx <ctx>` option that is semantically equal to
+:ref:`defines_by_path <defines_by_path_ctx>` -- list of decode paths
+mapped to any non-None value. If specified decode path is met, then any
+subsequent objects won't be decoded in evgen mode. That allows us to
+parse the CRL above with fully assembled ``RevokedCertificate``::
+
+    for decode_path, obj, _ in CertificateList().decode_evgen(
+        crl_raw,
+        ctx={"evgen_mode_upto": (
+            (("tbsCertList", "revokedCertificates", any), True),
+        )},
+    ):
+        if (
+            len(decode_path) == 3 and
+            decode_path[:2] == ("tbsCertList", "revokedCertificates")
+        ):
+            print("serial number:", int(obj["userCertificate"]))
+
+.. _mmap:
+
+mmap-ed file
+____________
+
+POSIX compliant systems have ``mmap`` syscall, giving ability to work
+with the memory mapped file. You can deal with the file like it was an
+ordinary binary string, allowing you not to load it to the memory first.
+Also you can use them as an input for OCTET STRING, taking no Python
+memory for their storage.
+
+There is convenient :py:func:`pyderasn.file_mmaped` function that
+creates read-only memoryview on the file contents::
+
+    with open("huge", "rb") as fd:
+        raw = file_mmaped(fd)
+        obj, tail = Something().decode(raw)
+
+.. warning::
+
+   mmap-ed files in Python2.7 do not implement buffer protocol, so
+   memoryview won't work on them.
+
+.. warning::
+
+   mmap maps the **whole** file. So it plays no role if you seek-ed it
+   before. Take the slice of the resulting memoryview with required
+   offset instead.
+
+.. note::
+
+   If you use ZFS as underlying storage, then pay attention that
+   currently most platforms do not deal well with ZFS ARC and ordinary
+   page cache used for mmaps. It can take twice the necessary size in
+   the memory: both in page cache and ZFS ARC.
+
+CER encoding
+____________
+
+We can parse any kind of data now, but how can we produce files
+streamingly, without storing their encoded representation in memory?
+SEQUENCE by default encodes in memory all its values, joins them in huge
+binary string, just to know the exact size of SEQUENCE's value for
+encoding it in TLV. DER requires you to know all exact sizes of the
+objects.
+
+You can use CER encoding mode, that slightly differs from the DER, but
+does not require exact sizes knowledge, allowing streaming encoding
+directly to some writer/buffer. Just use
+:py:meth:`pyderasn.Obj.encode_cer` method, providing the writer where
+encoded data will flow::
+
+    opener = io.open if PY2 else open
+    with opener("result", "wb") as fd:
+        obj.encode_cer(fd.write)
+
+::
+
+    buf = io.BytesIO()
+    obj.encode_cer(buf.write)
+
+If you do not want to create in-memory buffer every time, then you can
+use :py:func:`pyderasn.encode_cer` function::
+
+    data = encode_cer(obj)
+
+Remember that CER is **not valid** DER in most cases, so you **have to**
+use :ref:`bered <bered_ctx>` :ref:`ctx <ctx>` option during its
+decoding. Also currently there is **no** validation that provided CER is
+valid one -- you can be sure only that it has valid BER encoding.
+
+.. warning::
+
+   SET OF values can not be streamingly encoded, because they are
+   required to be sorted byte-by-byte. Big SET OF values still will take
+   much memory. Use neither SET nor SET OF values, as modern ASN.1
+   also recommends.
+
+Do not forget about using :ref:`mmap-ed <mmap>` memoryviews for your
+OCTET STRINGs! They will be streamingly copied from underlying file to
+the buffer using 1 KB chunks.
+
+Some structures require that some of the elements have to be forcefully
+DER encoded. For example ``SignedData`` CMS requires you to encode
+``SignedAttributes`` and X.509 certificates in DER form, allowing you to
+encode everything else in BER. You can tell any of the structures to be
+forcefully encoded in DER during CER encoding, by specifying
+``der_forced=True`` attribute::
+
+    class Certificate(Sequence):
+        schema = (...)
+        der_forced = True
+
+    class SignedAttributes(SetOf):
+        schema = Attribute()
+        bounds = (1, 32)
+        der_forced = True
+
+agg_octet_string
+________________
+
+In most cases, huge quantity of binary data is stored as OCTET STRING.
+CER encoding splits it on 1 KB chunks. BER allows splitting on various
+levels of chunks inclusion::
+
+    SOME STRING[CONSTRUCTED]
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+        OCTET STRING[PRIMITIVE]
+            DATA CHUNK
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[CONSTRUCTED]
+                OCTET STRING[PRIMITIVE]
+                    DATA CHUNK
+
+You can not just take the offset and some ``.vlen`` of the STRING and
+treat it as the payload. If you decode it without
+:ref:`evgen mode <evgen_mode>`, then it will be automatically aggregated
+and ``bytes()`` will give the whole payload contents.
+
+You are forced to use :ref:`evgen mode <evgen_mode>` for decoding for
+small memory footprint. There is convenient
+:py:func:`pyderasn.agg_octet_string` helper for reconstructing the
+payload. Let's assume you have got BER/CER encoded ``ContentInfo`` with
+huge ``SignedData`` and ``EncapsulatedContentInfo``. Let's calculate the
+SHA512 digest of its ``eContent``::
+
+    fd = open("data.p7m", "rb")
+    raw = file_mmaped(fd)
+    ctx = {"bered": True}
+    for decode_path, obj, _ in ContentInfo().decode_evgen(raw, ctx=ctx):
+        if decode_path == ("content",):
+            content = obj
+            break
+    else:
+        raise ValueError("no content found")
+    hasher_state = sha512()
+    def hasher(data):
+        hasher_state.update(data)
+        return len(data)
+    evgens = SignedData().decode_evgen(
+        raw[content.offset:],
+        offset=content.offset,
+        ctx=ctx,
+    )
+    agg_octet_string(evgens, ("encapContentInfo", "eContent"), raw, hasher)
+    fd.close()
+    digest = hasher_state.digest()
+
+Simply replace ``hasher`` with some writeable file's ``fd.write`` to
+copy the payload (without BER/CER encoding interleaved overhead) in it.
+Virtually it won't take memory more than for keeping small structures
+and 1 KB binary chunks.
+
+SEQUENCE OF iterators
+_____________________
+
+You can use iterators as a value in :py:class:`pyderasn.SequenceOf`
+classes. The only difference with providing the full list of objects, is
+that type and bounds checking is done during encoding process. Also
+sequence's value will be emptied after encoding, forcing you to set its
+value again.
+
+This is very useful when you have to create some huge objects, like
+CRLs, with thousands and millions of entities inside. You can write the
+generator taking necessary data from the database and giving the
+``RevokedCertificate`` objects. Only binary representation of those
+objects will take memory during DER encoding.
+
 Base Obj
 --------
 .. autoclass:: pyderasn.Obj
@@ -642,7 +951,10 @@ Various
 -------
 
 .. autofunction:: pyderasn.abs_decode_path
+.. autofunction:: pyderasn.agg_octet_string
 .. autofunction:: pyderasn.colonize_hex
+.. autofunction:: pyderasn.encode_cer
+.. autofunction:: pyderasn.file_mmaped
 .. autofunction:: pyderasn.hexenc
 .. autofunction:: pyderasn.hexdec
 .. autofunction:: pyderasn.tag_encode
@@ -772,6 +1084,7 @@ Now you can print only the specified tree, for example signature algorithm::
                          . . 05:00
 """
 
+from array import array
 from codecs import getdecoder
 from codecs import getencoder
 from collections import namedtuple
@@ -779,7 +1092,10 @@ from collections import OrderedDict
 from copy import copy
 from datetime import datetime
 from datetime import timedelta
+from io import BytesIO
 from math import ceil
+from mmap import mmap
+from mmap import PROT_READ
 from operator import attrgetter
 from string import ascii_letters
 from string import digits
@@ -811,6 +1127,7 @@ except ImportError:  # pragma: no cover
 __version__ = "7.0"
 
 __all__ = (
+    "agg_octet_string",
     "Any",
     "BitString",
     "BMPString",
@@ -819,8 +1136,10 @@ __all__ = (
     "Choice",
     "DecodeError",
     "DecodePathDefBy",
+    "encode_cer",
     "Enumerated",
     "ExceedingData",
+    "file_mmaped",
     "GeneralizedTime",
     "GeneralString",
     "GraphicString",
@@ -886,7 +1205,16 @@ NAMEDTUPLE_KWARGS = {} if version_info < (3, 6) else {"module": __name__}
 SET01 = frozenset("01")
 DECIMALS = frozenset(digits)
 DECIMAL_SIGNS = ".,"
+NEXT_ATTR_NAME = "next" if PY2 else "__next__"
+
+
+def file_mmaped(fd):
+    """Make mmap-ed memoryview for reading from file
 
+    :param fd: file object
+    :returns: memoryview over read-only mmap-ing of the whole file
+    """
+    return memoryview(mmap(fd.fileno(), 0, prot=PROT_READ))
 
 def pureint(value):
     if not set(value) <= DECIMALS:
@@ -1190,6 +1518,27 @@ def len_decode(data):
     return l, 1 + octets_num, data[1 + octets_num:]
 
 
+LEN1K = len_encode(1000)
+
+
+def write_full(writer, data):
+    """Fully write provided data
+
+    :param writer: must comply with ``io.RawIOBase.write`` behaviour
+    :param data: bytes-like object to be written
+
+    ``writer`` is called repeatedly until the whole data is processed,
+    because it may write only a part of it. None result raises ValueError.
+    """
+    data = memoryview(data)
+    written = 0
+    while written != len(data):
+        n = writer(data[written:])
+        if n is None:
+            raise ValueError("can not write to buf")
+        written += n
+
+
 ########################################################################
 # Base class
 ########################################################################
@@ -1314,6 +1663,10 @@ class Obj(object):
         """
         return self._tag_order
 
+    @property
+    def tag_order_cer(self):
+        return self.tag_order
+
     @property
     def tlen(self):
         """See :ref:`decoding`
@@ -1348,7 +1701,7 @@ class Obj(object):
         yield NotImplemented
 
     def encode(self):
-        """Encode the structure
+        """DER encode the structure
 
         :returns: DER representation
         """
@@ -1357,6 +1710,26 @@ class Obj(object):
             return raw
         return b"".join((self._expl, len_encode(len(raw)), raw))
 
+    def encode_cer(self, writer):
+        """CER encode the structure to specified writer
+
+        :param writer: must comply with ``io.RawIOBase.write``
+                       behaviour. It takes slice to be written and
+                       returns number of bytes processed. If it returns
+                       None, then exception will be raised
+        """
+        if self._expl is not None:
+            write_full(writer, self._expl + LENINDEF)
+        if getattr(self, "der_forced", False):
+            write_full(writer, self._encode())
+        else:
+            self._encode_cer(writer)
+        if self._expl is not None:
+            write_full(writer, EOC)
+
+    def _encode_cer(self, writer):
+        write_full(writer, self._encode())
+
     def hexencode(self):
         """Do hexadecimal encoded :py:meth:`pyderasn.Obj.encode`
         """
@@ -1372,6 +1745,26 @@ class Obj(object):
             tag_only=False,
             _ctx_immutable=True,
     ):
+        """Decode the data
+
+        :param data: either binary or memoryview
+        :param int offset: initial data's offset
+        :param bool leavemm: do we need to leave memoryview of remaining
+                    data as is, or convert it to bytes otherwise
+        :param decode_path: current decode path (tuples of strings,
+                            possibly with DecodePathDefBy) which will be
+                            the root for all underlying objects
+        :param ctx: optional :ref:`context <ctx>` governing decoding process
+        :param bool tag_only: decode only the tag, without length and
+                              contents (used only in Choice and Set
+                              structures, trying to determine if tag satisfies
+                              the schema)
+        :param bool _ctx_immutable: do we need to ``copy.copy()`` ``ctx``
+                                    before using it?
+        :returns: (Obj, remaining data)
+
+        .. seealso:: :ref:`decoding`
+        """
         result = next(self.decode_evgen(
             data,
             offset,
@@ -1398,21 +1791,11 @@ class Obj(object):
             _ctx_immutable=True,
             _evgen_mode=True,
     ):
-        """Decode the data
-
-        :param data: either binary or memoryview
-        :param int offset: initial data's offset
-        :param bool leavemm: do we need to leave memoryview of remaining
-                    data as is, or convert it to bytes otherwise
-        :param ctx: optional :ref:`context <ctx>` governing decoding process
-        :param tag_only: decode only the tag, without length and contents
-                         (used only in Choice and Set structures, trying to
-                         determine if tag satisfies the schema)
-        :param _ctx_immutable: do we need to ``copy.copy()`` ``ctx``
-                               before using it?
-        :returns: (Obj, remaining data)
+        """Decode with evgen mode on
 
-        .. seealso:: :ref:`decoding`
+        That method is identical to :py:meth:`pyderasn.Obj.decode`, but
+        it returns the generator producing ``(decode_path, obj, tail)``
+        values. See :ref:`evgen mode <evgen_mode>`.
         """
         if ctx is None:
             ctx = {}
@@ -1648,6 +2031,16 @@ class Obj(object):
             )
 
 
+def encode_cer(obj):
+    """Encode to CER in memory buffer
+
+    :returns bytes: memory buffer contents
+    """
+    buf = BytesIO()
+    obj.encode_cer(buf.write)
+    return buf.getvalue()
+
+
 class DecodePathDefBy(object):
     """DEFINED BY representation inside decode path
     """
@@ -1886,6 +2279,7 @@ def pprint(
         with_colours=False,
         with_decode_path=False,
         decode_path_only=(),
+        decode_path=(),
 ):
     """Pretty print object
 
@@ -1938,7 +2332,7 @@ def pprint(
             else:
                 for row in _pprint_pps(pp):
                     yield row
-    return "\n".join(_pprint_pps(obj.pps()))
+    return "\n".join(_pprint_pps(obj.pps(decode_path)))
 
 
 ########################################################################
@@ -2784,6 +3178,30 @@ class BitString(Obj):
             octets,
         ))
 
+    def _encode_cer(self, writer):
+        """CER encode, splitting long values on 1000 octets chunks"""
+        bit_len, octets = self._value
+        if len(octets) + 1 <= 1000:
+            write_full(writer, self._encode())
+            return
+        write_full(writer, self.tag_constructed + LENINDEF)
+        for offset in six_xrange(0, ((len(octets) - 1) // 999) * 999, 999):
+            write_full(writer, b"".join((
+                BitString.tag_default,
+                LEN1K,
+                int2byte(0),
+                octets[offset:offset + 999],
+            )))
+        # Only the last chunk may carry non-zero unused bits count, so the
+        # loop always leaves 1..999 octets for it, even if it is a full one
+        tail = int2byte((8 - bit_len % 8) % 8) + octets[offset + 999:]
+        write_full(writer, b"".join((
+            BitString.tag_default,
+            len_encode(len(tail)),
+            tail,
+        )))
+        write_full(writer, EOC)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         try:
             t, tlen, lv = tag_strip(tlv)
@@ -3063,13 +3481,10 @@ class OctetString(Obj):
     >>> OctetString(b"hell", bounds=(4, 4))
     OCTET STRING 4 bytes 68656c6c
 
-    .. note::
-
-       Pay attention that OCTET STRING can be encoded both in primitive
-       and constructed forms. Decoder always checks constructed form tag
-       additionally to specified primitive one. If BER decoding is
-       :ref:`not enabled <bered_ctx>`, then decoder will fail, because
-       of DER restrictions.
+    Memoryviews can be used as values. If memoryview is made on
+    mmap-ed file, then it does not take storage inside OctetString
+    itself. In CER encoding mode it will be streamed to the specified
+    writer, copying 1 KB chunks.
     """
     __slots__ = ("tag_constructed", "_bound_min", "_bound_max", "defined")
     tag_default = tag_encode(4)
@@ -3124,12 +3539,12 @@ class OctetString(Obj):
         )
 
     def _value_sanitize(self, value):
-        if value.__class__ == binary_type:
+        if value.__class__ == binary_type or value.__class__ == memoryview:
             pass
         elif issubclass(value.__class__, OctetString):
             value = value._value
         else:
-            raise InvalidValueType((self.__class__, bytes))
+            raise InvalidValueType((self.__class__, bytes, memoryview))
         if not self._bound_min <= len(value) <= self._bound_max:
             raise BoundsError(self._bound_min, len(value), self._bound_max)
         return value
@@ -3169,7 +3584,7 @@ class OctetString(Obj):
 
     def __bytes__(self):
         self._assert_ready()
-        return self._value
+        return bytes(self._value)
 
     def __eq__(self, their):
         if their.__class__ == binary_type:
@@ -3214,6 +3629,28 @@ class OctetString(Obj):
             self._value,
         ))
 
+    def _encode_cer(self, writer):
+        octets = self._value
+        if len(octets) <= 1000:
+            write_full(writer, self._encode())
+            return
+        write_full(writer, self.tag_constructed)
+        write_full(writer, LENINDEF)
+        for offset in six_xrange(0, (len(octets) // 1000) * 1000, 1000):
+            write_full(writer, b"".join((
+                OctetString.tag_default,
+                LEN1K,
+                octets[offset:offset + 1000],
+            )))
+        tail = octets[offset+1000:]
+        if len(tail) > 0:
+            write_full(writer, b"".join((
+                OctetString.tag_default,
+                len_encode(len(tail)),
+                tail,
+            )))
+        write_full(writer, EOC)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         try:
             t, tlen, lv = tag_strip(tlv)
@@ -3450,6 +3887,31 @@ class OctetString(Obj):
             yield pp
 
 
+def agg_octet_string(evgens, decode_path, raw, writer):
+    """Aggregate constructed string (OctetString and its derivatives)
+
+    :param evgens: iterator of generated events
+    :param decode_path: points to the string we want to decode
+    :param raw: sliceable (memoryview, bytearray, etc) with
+                the data evgens are generated on
+    :param writer: where string is going to be saved, ``buffer.write``
+                   for example. Must comply with
+                   ``io.RawIOBase.write`` behaviour
+    """
+    decode_path_len = len(decode_path)
+    for dp, obj, _ in evgens:
+        if dp[:decode_path_len] != decode_path:
+            continue
+        if not obj.ber_encoded:
+            write_full(writer, raw[
+                obj.offset + obj.tlen + obj.llen:
+                obj.offset + obj.tlen + obj.llen + obj.vlen -
+                (EOC_LEN if obj.expl_lenindef else 0)
+            ])
+        if len(dp) == decode_path_len:
+            break
+
+
 NullState = namedtuple("NullState", BasicState._fields, **NAMEDTUPLE_KWARGS)
 
 
@@ -3669,7 +4131,7 @@ class ObjectIdentifier(Obj):
 
     def __add__(self, their):
         if their.__class__ == tuple:
-            return self.__class__(self._value + their)
+            return self.__class__(self._value + array("L", their))
         if isinstance(their, self.__class__):
             return self.__class__(self._value + their._value)
         raise InvalidValueType((self.__class__, tuple))
@@ -3679,10 +4141,15 @@ class ObjectIdentifier(Obj):
             return value._value
         if isinstance(value, string_types):
             try:
-                value = tuple(pureint(arc) for arc in value.split("."))
+                value = array("L", (pureint(arc) for arc in value.split(".")))
             except ValueError:
                 raise InvalidOID("unacceptable arcs values")
         if value.__class__ == tuple:
+            try:
+                value = array("L", value)
+            except OverflowError as err:
+                raise InvalidOID(repr(err))
+        if value.__class__ is array:
             if len(value) < 2:
                 raise InvalidOID("less than 2 arcs")
             first_arc = value[0]
@@ -3742,7 +4209,7 @@ class ObjectIdentifier(Obj):
 
     def __eq__(self, their):
         if their.__class__ == tuple:
-            return self._value == their
+            return self._value == array("L", their)
         if not issubclass(their.__class__, ObjectIdentifier):
             return False
         return (
@@ -3834,7 +4301,7 @@ class ObjectIdentifier(Obj):
                 offset=offset,
             )
         v, tail = v[:l], v[l:]
-        arcs = []
+        arcs = array("L")
         ber_encoded = False
         while len(v) > 0:
             i = 0
@@ -3845,10 +4312,23 @@ class ObjectIdentifier(Obj):
                     if ctx.get("bered", False):
                         ber_encoded = True
                     else:
-                        raise DecodeError("non normalized arc encoding")
+                        raise DecodeError(
+                            "non normalized arc encoding",
+                            klass=self.__class__,
+                            decode_path=decode_path,
+                            offset=offset,
+                        )
                 arc = (arc << 7) | (octet & 0x7F)
                 if octet & 0x80 == 0:
-                    arcs.append(arc)
+                    try:
+                        arcs.append(arc)
+                    except OverflowError:
+                        raise DecodeError(
+                            "too huge value for local unsigned long",
+                            klass=self.__class__,
+                            decode_path=decode_path,
+                            offset=offset,
+                        )
                     v = v[i + 1:]
                     break
                 i += 1
@@ -3870,7 +4350,7 @@ class ObjectIdentifier(Obj):
             first_arc = 2
             second_arc -= 80
         obj = self.__class__(
-            value=tuple([first_arc, second_arc] + arcs[1:]),
+            value=array("L", (first_arc, second_arc)) + arcs[1:],
             impl=self.tag,
             expl=self._expl,
             default=self.default,
@@ -4534,6 +5014,9 @@ class UTCTime(VisibleString):
         value = self._encode_time()
         return b"".join((self.tag, len_encode(len(value)), value))
 
+    def _encode_cer(self, writer):
+        write_full(writer, self._encode())
+
     def todatetime(self):
         return self._value
 
@@ -4905,6 +5388,10 @@ class Choice(Obj):
         self._assert_ready()
         return self._value[1].tag_order if self._tag_order is None else self._tag_order
 
+    @property
+    def tag_order_cer(self):
+        return min(v.tag_order_cer for v in itervalues(self.specs))
+
     def __getitem__(self, key):
         if key not in self.specs:
             raise ObjUnknown(key)
@@ -4935,6 +5422,10 @@ class Choice(Obj):
         self._assert_ready()
         return self._value[1].encode()
 
+    def _encode_cer(self, writer):
+        self._assert_ready()
+        self._value[1].encode_cer(writer)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         for choice, spec in iteritems(self.specs):
             sub_decode_path = decode_path + (choice,)
@@ -5066,7 +5557,7 @@ class Any(Obj):
     """``ANY`` special type
 
     >>> Any(Integer(-123))
-    ANY 020185
+    ANY INTEGER -123 (0X:7B)
     >>> a = Any(OctetString(b"hello world").encode())
     ANY 040b68656c6c6f20776f726c64
     >>> hexenc(bytes(a))
@@ -5114,9 +5605,9 @@ class Any(Obj):
             return value
         if isinstance(value, self.__class__):
             return value._value
-        if isinstance(value, Obj):
-            return value.encode()
-        raise InvalidValueType((self.__class__, Obj, binary_type))
+        if not isinstance(value, Obj):
+            raise InvalidValueType((self.__class__, Obj, binary_type))
+        return value
 
     @property
     def ready(self):
@@ -5160,9 +5651,13 @@ class Any(Obj):
 
     def __eq__(self, their):
         if their.__class__ == binary_type:
-            return self._value == their
+            if self._value.__class__ == binary_type:
+                return self._value == their
+            return self._value.encode() == their  # held object: compare its encoding
         if issubclass(their.__class__, Any):
-            return self._value == their._value
+            if self.ready and their.ready:
+                return bytes(self) == bytes(their)  # both set: compare serialized forms
+            return self.ready == their.ready  # reached only if at least one side is not ready
         return False
 
     def __call__(
@@ -5179,7 +5674,10 @@ class Any(Obj):
 
     def __bytes__(self):
         self._assert_ready()
-        return self._value
+        value = self._value
+        if value.__class__ == binary_type:
+            return value  # raw bytes are returned unchanged
+        return self._value.encode()  # otherwise the held object's encoding
 
     @property
     def tlen(self):
@@ -5187,7 +5685,18 @@ class Any(Obj):
 
     def _encode(self):
         self._assert_ready()
-        return self._value
+        value = self._value
+        if value.__class__ == binary_type:
+            return value  # raw bytes are returned unchanged
+        return value.encode()  # otherwise the held object's encoding
+
+    def _encode_cer(self, writer):  # streams the wrapped value to writer
+        self._assert_ready()
+        value = self._value
+        if value.__class__ == binary_type:
+            write_full(writer, value)  # raw bytes are written unchanged
+        else:
+            value.encode_cer(writer)  # otherwise defer to the held object's encode_cer()
 
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         try:
@@ -5264,12 +5773,20 @@ class Any(Obj):
         return pp_console_row(next(self.pps()))
 
     def pps(self, decode_path=()):
+        value = self._value
+        if value is None:
+            pass
+        elif value.__class__ == binary_type:
+            value = None
+        else:
+            value = repr(value)
         yield _pp(
             obj=self,
             asn1_type_name=self.asn1_type_name,
             obj_name=self.__class__.__name__,
             decode_path=decode_path,
-            blob=self._value if self.ready else None,
+            value=value,
+            blob=self._value if self._value.__class__ == binary_type else None,
             optional=self.optional,
             default=self == self.default,
             impl=None if self.tag == self.tag_default else tag_decode(self.tag),
@@ -5591,6 +6108,12 @@ class Sequence(Obj):
         v = b"".join(v.encode() for v in self._values_for_encoding())
         return b"".join((self.tag, len_encode(len(v)), v))
 
+    def _encode_cer(self, writer):  # writes tag + LENINDEF, each member, then EOC
+        write_full(writer, self.tag + LENINDEF)
+        for v in self._values_for_encoding():
+            v.encode_cer(writer)  # members are written one by one, not joined first
+        write_full(writer, EOC)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         try:
             t, tlen, lv = tag_strip(tlv)
@@ -5853,6 +6376,15 @@ class Set(Sequence):
         ))
         return b"".join((self.tag, len_encode(len(v)), v))
 
+    def _encode_cer(self, writer):  # members are written in tag_order_cer order
+        write_full(writer, self.tag + LENINDEF)
+        for v in sorted(
+                self._values_for_encoding(),
+                key=attrgetter("tag_order_cer"),  # sorts by the attribute, not by encoded bytes
+        ):
+            v.encode_cer(writer)
+        write_full(writer, EOC)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         try:
             t, tlen, lv = tag_strip(tlv)
@@ -6049,9 +6581,21 @@ class SequenceOf(Obj):
     >>> ints
     Ints SEQUENCE OF[INTEGER 123, INTEGER 345]
 
-    Also you can initialize sequence with preinitialized values:
+    You can initialize sequence with preinitialized values:
 
     >>> ints = Ints([Integer(123), Integer(234)])
+
+    You can also use an iterator as a value:
+
+    >>> ints = Ints(iter(Integer(i) for i in range(1000000)))
+
+    The iterator is not consumed until the encoding process. Pay
+    attention that bounds and required schema checks are done only
+    during encoding in that case! After encoding, the value is reset to
+    an empty list and you have to set it again. That mode is mainly
+    useful with CER encoding, where the objects from the iterable are
+    streamed to the buffer, without copying all of them to memory
+    first.
     """
     __slots__ = ("spec", "_bound_min", "_bound_max")
     tag_default = tag_encode(form=TagFormConstructed, num=16)
@@ -6095,21 +6639,31 @@ class SequenceOf(Obj):
                 self._value = copy(default_obj._value)
 
     def _value_sanitize(self, value):
+        iterator = False  # set when value exposes NEXT_ATTR_NAME, i.e. is treated as an iterator
         if issubclass(value.__class__, SequenceOf):
             value = value._value
+        elif hasattr(value, NEXT_ATTR_NAME):
+            iterator = True
+            # kept as is (not turned into a list); the checks below are skipped for it
         elif hasattr(value, "__iter__"):
             value = list(value)
         else:
-            raise InvalidValueType((self.__class__, iter))
-        if not self._bound_min <= len(value) <= self._bound_max:
-            raise BoundsError(self._bound_min, len(value), self._bound_max)
-        for v in value:
-            if not isinstance(v, self.spec.__class__):
-                raise InvalidValueType((self.spec.__class__,))
+            raise InvalidValueType((self.__class__, iter, "iterator"))
+        if not iterator:
+            if not self._bound_min <= len(value) <= self._bound_max:
+                raise BoundsError(self._bound_min, len(value), self._bound_max)
+            class_expected = self.spec.__class__
+            for v in value:
+                if not isinstance(v, class_expected):
+                    raise InvalidValueType((class_expected,))
         return value
 
     @property
     def ready(self):
+        if hasattr(self._value, NEXT_ATTR_NAME):  # iterator values are reported ready without being consumed
+            return True
+        if self._bound_min > 0 and len(self._value) == 0:  # empty value cannot meet a positive lower bound
+            return False
         return all(v.ready for v in self._value)
 
     @property
@@ -6119,6 +6673,8 @@ class SequenceOf(Obj):
         return any(v.bered for v in self._value)
 
     def __getstate__(self):
+        if hasattr(self._value, NEXT_ATTR_NAME):
+            raise ValueError("can not pickle SequenceOf with iterator")
         return SequenceOfState(
             __version__,
             self.tag,
@@ -6194,11 +6750,9 @@ class SequenceOf(Obj):
         self._value.append(value)
 
     def __iter__(self):
-        self._assert_ready()
         return iter(self._value)
 
     def __len__(self):
-        self._assert_ready()
         return len(self._value)
 
     def __setitem__(self, key, value):
@@ -6213,8 +6767,43 @@ class SequenceOf(Obj):
         return iter(self._value)
 
     def _encode(self):
-        v = b"".join(v.encode() for v in self._values_for_encoding())
-        return b"".join((self.tag, len_encode(len(v)), v))
+        iterator = hasattr(self._value, NEXT_ATTR_NAME)  # iterator values are validated in this method
+        if iterator:
+            values = []
+            values_append = values.append
+            class_expected = self.spec.__class__
+            values_for_encoding = self._values_for_encoding()
+            self._value = []  # reset to an empty list before the loop runs
+            for v in values_for_encoding:
+                if not isinstance(v, class_expected):  # element type is checked per item
+                    raise InvalidValueType((class_expected,))
+                values_append(v.encode())
+            if not self._bound_min <= len(values) <= self._bound_max:  # count is known only after the loop
+                raise BoundsError(self._bound_min, len(values), self._bound_max)
+            value = b"".join(values)
+        else:
+            value = b"".join(v.encode() for v in self._values_for_encoding())
+        return b"".join((self.tag, len_encode(len(value)), value))
+
+    def _encode_cer(self, writer):  # streams elements between tag + LENINDEF and EOC
+        write_full(writer, self.tag + LENINDEF)
+        iterator = hasattr(self._value, NEXT_ATTR_NAME)
+        if iterator:
+            class_expected = self.spec.__class__
+            values_count = 0
+            values_for_encoding = self._values_for_encoding()
+            self._value = []  # reset to an empty list before the loop runs
+            for v in values_for_encoding:
+                if not isinstance(v, class_expected):  # element type is checked per item
+                    raise InvalidValueType((class_expected,))
+                v.encode_cer(writer)
+                values_count += 1
+            if not self._bound_min <= values_count <= self._bound_max:  # NOTE: header and elements are already written
+                raise BoundsError(self._bound_min, values_count, self._bound_max)
+        else:
+            for v in self._values_for_encoding():
+                v.encode_cer(writer)
+        write_full(writer, EOC)
 
     def _decode(
             self,
@@ -6407,10 +6996,24 @@ class SetOf(SequenceOf):
     tag_default = tag_encode(form=TagFormConstructed, num=17)
     asn1_type_name = "SET OF"
 
+    def _value_sanitize(self, value):  # parent sanitizing first, then iterators are refused
+        value = super(SetOf, self)._value_sanitize(value)
+        if hasattr(value, NEXT_ATTR_NAME):  # result still exposes NEXT_ATTR_NAME: reject it
+            raise ValueError(
+                "SetOf does not support iterator values, as no sense in them"
+            )
+        return value
+
     def _encode(self):
         v = b"".join(sorted(v.encode() for v in self._values_for_encoding()))
         return b"".join((self.tag, len_encode(len(v)), v))
 
+    def _encode_cer(self, writer):  # elements are encoded first, then written in sorted order
+        write_full(writer, self.tag + LENINDEF)
+        for v in sorted(encode_cer(v) for v in self._values_for_encoding()):  # sorts the encode_cer() results
+            write_full(writer, v)
+        write_full(writer, EOC)
+
     def _decode(self, tlv, offset, decode_path, ctx, tag_only, evgen_mode):
         return super(SetOf, self)._decode(
             tlv,
@@ -6540,14 +7143,22 @@ def main():  # pragma: no cover
         help="Allow explicit tag out-of-bound",
     )
     parser.add_argument(
-        "DERFile",
+        "--evgen",
+        action="store_true",
+        help="Turn on event generation mode",
+    )
+    parser.add_argument(
+        "RAWFile",
         type=argparse.FileType("rb"),
-        help="Path to DER file you want to decode",
+        help="Path to BER/CER/DER file you want to decode",
     )
     args = parser.parse_args()
-    args.DERFile.seek(args.skip)
-    der = memoryview(args.DERFile.read())
-    args.DERFile.close()
+    if PY2:
+        args.RAWFile.seek(args.skip)
+        raw = memoryview(args.RAWFile.read())
+        args.RAWFile.close()
+    else:
+        raw = file_mmaped(args.RAWFile)[args.skip:]
     oid_maps = (
         [obj_by_path(_path) for _path in (args.oids or "").split(",")]
         if args.oids else ()
@@ -6564,10 +7175,9 @@ def main():  # pragma: no cover
     }
     if args.defines_by_path is not None:
         ctx["defines_by_path"] = obj_by_path(args.defines_by_path)
-    obj, tail = schema().decode(der, ctx=ctx)
     from os import environ
-    print(pprinter(
-        obj,
+    pprinter = partial(
+        pprinter,
         oid_maps=oid_maps,
         with_colours=environ.get("NO_COLOR") is None,
         with_decode_path=args.print_decode_path,
@@ -6575,7 +7185,13 @@ def main():  # pragma: no cover
             () if args.decode_path_only is None else
             tuple(args.decode_path_only.split(":"))
         ),
-    ))
+    )
+    if args.evgen:
+        for decode_path, obj, tail in schema().decode_evgen(raw, ctx=ctx):
+            print(pprinter(obj, decode_path=decode_path))
+    else:
+        obj, tail = schema().decode(raw, ctx=ctx)
+        print(pprinter(obj))
     if tail != b"":
         print("\nTrailing data: %s" % hexenc(tail))