Up to date documentation

[pyderasn.git] / pyderasn.py
diff --git a/pyderasn.py b/pyderasn.py

index 62776b4bbb7570aabdf37ace1ce6d0e52be14551..68dd21339f62e6801e88788215eb19fcda613fb7 100755 (executable)
--- a/pyderasn.py
+++ b/pyderasn.py
@@ -232,6 +232,7 @@ Currently available context options:
  * :ref:`allow_unordered_set <allow_unordered_set_ctx>`
  * :ref:`bered <bered_ctx>`
  * :ref:`defines_by_path <defines_by_path_ctx>`
  * :ref:`allow_unordered_set <allow_unordered_set_ctx>`
  * :ref:`bered <bered_ctx>`
  * :ref:`defines_by_path <defines_by_path_ctx>`
+* :ref:`evgen_mode_upto <evgen_mode_upto_ctx>`
  
  .. _pprinting:
  
  
  .. _pprinting:
  
@@ -428,7 +429,7 @@ ______________________________
  
  Sometimes you either can not or do not want to explicitly set *defines*
  in the schema. You can dynamically apply those definitions when calling
  
  Sometimes you either can not or do not want to explicitly set *defines*
  in the schema. You can dynamically apply those definitions when calling
-``.decode()`` method.
+:py:meth:`pyderasn.Obj.decode` method.
  
  Specify ``defines_by_path`` key in the :ref:`decode context <ctx>`. Its
  value must be sequence of following tuples::
  
  Specify ``defines_by_path`` key in the :ref:`decode context <ctx>`. Its
  value must be sequence of following tuples::
@@ -492,9 +493,9 @@ useful for SEQUENCE/SET OF-s.
  BER encoding
  ------------
  
  BER encoding
  ------------
  
-By default PyDERASN accepts only DER encoded data. It always encodes to
-DER. But you can optionally enable BER decoding with setting ``bered``
-:ref:`context <ctx>` argument to True. Indefinite lengths and
+By default PyDERASN accepts only DER encoded data. By default it encodes
+to DER. But you can optionally enable BER decoding with setting
+``bered`` :ref:`context <ctx>` argument to True. Indefinite lengths and
  constructed primitive types should be parsed successfully.
  
  * If object is encoded in BER form (not the DER one), then ``ber_encoded``
  constructed primitive types should be parsed successfully.
  
  * If object is encoded in BER form (not the DER one), then ``ber_encoded``
@@ -533,6 +534,314 @@ lengths will be invalid in that case.
     This option should be used only for skipping some decode errors, just
     to see the decoded structure somehow.
  
     This option should be used only for skipping some decode errors, just
     to see the decoded structure somehow.
  
+.. _streaming:
+
+Streaming and dealing with huge structures
+------------------------------------------
+
+.. _evgen_mode:
+
+evgen mode
+__________
+
+ASN.1 structures can be huge, they can hold millions of objects inside
+(for example Certificate Revocation Lists (CRL), holding revocation
+state for every previously issued X.509 certificate). CACert.org's 8 MiB
+CRL file takes more than half a gigabyte of memory to hold the decoded
+structure.
+
+If you just simply want to check the signature over the ``tbsCertList``,
+you can create specialized schema with that field represented as
+OctetString for example::
+
+    class TBSCertListFast(Sequence):
+        schema = (
+            [...]
+            ("revokedCertificates", OctetString(
+                impl=SequenceOf.tag_default,
+                optional=True,
+            )),
+            [...]
+        )
+
+This allows you to quickly decode a few fields and check the signature
+over the ``tbsCertList`` bytes.
+
+But how can you get all certificate's serial number from it, after you
+trust that CRL after signature validation? You can use so called
+``evgen`` (event generation) mode, to catch the events/facts of some
+successful object decoding. Let's use command line capabilities::
+
+    $ python -m pyderasn --schema tests.test_crl:CertificateList --evgen revoke.crl
+         10     [1,1,   1]   . . version: Version INTEGER v2 (01) OPTIONAL
+         15     [1,1,   9]   . . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13
+         26     [0,0,   2]   . . . parameters: [UNIV 5] ANY OPTIONAL
+         13     [1,1,  13]   . . signature: AlgorithmIdentifier SEQUENCE
+         34     [1,1,   3]   . . . . . . type: AttributeType OBJECT IDENTIFIER 2.5.4.10
+         39     [0,0,   9]   . . . . . . value: [UNIV 19] AttributeValue ANY
+         32     [1,1,  14]   . . . . . 0: AttributeTypeAndValue SEQUENCE
+         30     [1,1,  16]   . . . . 0: RelativeDistinguishedName SET OF
+    [...]
+        188     [1,1,   1]   . . . . userCertificate: CertificateSerialNumber INTEGER 17 (11)
+        191     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08
+        191     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+        191     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2003-04-01T14:25:08
+        186     [1,1,  18]   . . . 0: RevokedCertificate SEQUENCE
+        208     [1,1,   1]   . . . . userCertificate: CertificateSerialNumber INTEGER 20 (14)
+        211     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01
+        211     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+        211     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2002-10-01T02:18:01
+        206     [1,1,  18]   . . . 1: RevokedCertificate SEQUENCE
+    [...]
+    9144992     [0,0,  15]   . . . . revocationDate: Time CHOICE utcTime
+    9144992     [1,1,  13]   . . . . . utcTime: UTCTime UTCTime 2020-02-08T07:25:06
+    9144985     [1,1,  20]   . . . 415755: RevokedCertificate SEQUENCE
+      181     [1,4,9144821]   . . revokedCertificates: RevokedCertificates SEQUENCE OF OPTIONAL
+        5     [1,4,9144997]   . tbsCertList: TBSCertList SEQUENCE
+    9145009     [1,1,   9]   . . algorithm: OBJECT IDENTIFIER 1.2.840.113549.1.1.13
+    9145020     [0,0,   2]   . . parameters: [UNIV 5] ANY OPTIONAL
+    9145007     [1,1,  13]   . signatureAlgorithm: AlgorithmIdentifier SEQUENCE
+    9145022     [1,3, 513]   . signatureValue: BIT STRING 4096 bits
+        0     [1,4,9145534]  CertificateList SEQUENCE
+
+Here we see how decoder works: it decodes SEQUENCE's tag, length, then
+decodes underlying values. It can not tell if SEQUENCE is decoded, so
+the event of the upper level SEQUENCE is the last one we see.
+``version`` field is just a single INTEGER -- it is decoded and event is
+fired immediately. Then we see that ``algorithm`` and ``parameters``
+fields are decoded and only after them the ``signature`` SEQUENCE is
+fired as a successfully decoded. There are 4 events for each revoked
+certificate entry in that CRL: ``userCertificate`` serial number,
+``utcTime`` of ``revocationDate`` CHOICE, ``RevokedCertificate`` itself
+as a one of entity in ``revokedCertificates`` SEQUENCE OF.
+
+We can do that in our ordinary Python code and understand where we are
+by looking at deterministically generated decode paths (do not forget
+about useful ``--print-decode-path`` CLI option). We must use
+:py:meth:`pyderasn.Obj.decode_evgen` method, instead of ordinary
+:py:meth:`pyderasn.Obj.decode`. It is generator yielding ``(decode_path,
+obj, tail)`` tuples::
+
+    for decode_path, obj, _ in CertificateList().decode_evgen(crl_raw):
+        if (
+            len(decode_path) == 4 and
+            decode_path[:2] == ("tbsCertList", "revokedCertificates"),
+            decode_path[3] == "userCertificate"
+        ):
+            print("serial number:", int(obj))
+
+Virtually it does not take any memory except at least needed for single
+object storage. You can easily use that mode to determine required
+object ``.offset`` and ``.*len`` to be able to decode it separately, or
+maybe verify signature upon it just by taking bytes by ``.offset`` and
+``.tlvlen``.
+
+.. _evgen_mode_upto_ctx:
+
+evgen_mode_upto
+_______________
+
+There is full ability to get any kind of data from the CRL in the
+example above. However it is not too convenient to get the whole
+``RevokedCertificate`` structure, that is pretty lightweight and one may
+do not want to disassemble it. You can use ``evgen_mode_upto``
+:ref:`ctx <ctx>` option that semantically equals to
+:ref:`defines_by_path <defines_by_path_ctx>` -- list of decode paths
+mapped to any non-None value. If specified decode path is met, then any
+subsequent objects won't be decoded in evgen mode. That allows us to
+parse the CRL above with fully assembled ``RevokedCertificate``::
+
+    for decode_path, obj, _ in CertificateList().decode_evgen(
+        crl_raw,
+        ctx={"evgen_mode_upto": (
+            (("tbsCertList", "revokedCertificates", any), True),
+        )},
+    ):
+        if (
+            len(decode_path) == 3 and
+            decode_path[:2] == ("tbsCertList", "revokedCertificates"),
+        ):
+            print("serial number:", int(obj["userCertificate"]))
+
+.. _mmap:
+
+mmap-ed file
+____________
+
+POSIX compliant systems have ``mmap`` syscall, giving ability to work
+the memory mapped file. You can deal with the file like it was an
+ordinary binary string, allowing you not to load it to the memory first.
+Also you can use them as an input for OCTET STRING, taking no Python
+memory for their storage.
+
+There is convenient :py:func:`pyderasn.file_mmaped` function that
+creates read-only memoryview on the file contents::
+
+    with open("huge", "rb") as fd:
+        raw = file_mmaped(fd)
+        obj = Something.decode(raw)
+
+.. warning::
+
+   mmap-ed files in Python2.7 does not implement buffer protocol, so
+   memoryview won't work on them.
+
+.. warning::
+
+   mmap maps the **whole** file. So it plays no role if you seek-ed it
+   before. Take the slice of the resulting memoryview with required
+   offset instead.
+
+.. note::
+
+   If you use ZFS as underlying storage, then pay attention that
+   currently most platforms does not deal good with ZFS ARC and ordinary
+   page cache used for mmaps. It can take twice the necessary size in
+   the memory: both in page cache and ZFS ARC.
+
+CER encoding
+____________
+
+We can parse any kind of data now, but how can we produce files
+streamingly, without storing their encoded representation in memory?
+SEQUENCE by default encodes in memory all its values, joins them in huge
+binary string, just to know the exact size of SEQUENCE's value for
+encoding it in TLV. DER requires you to know all exact sizes of the
+objects.
+
+You can use CER encoding mode, that slightly differs from the DER, but
+does not require exact sizes knowledge, allowing streaming encoding
+directly to some writer/buffer. Just use
+:py:meth:`pyderasn.Obj.encode_cer` method, providing the writer where
+encoded data will flow::
+
+    opener = io.open if PY2 else open
+    with opener("result", "wb") as fd:
+        obj.encode_cer(fd.write)
+
+::
+
+    buf = io.BytesIO()
+    obj.encode_cer(buf.write)
+
+If you do not want to create in-memory buffer every time, then you can
+use :py:func:`pyderasn.encode_cer` function::
+
+    data = encode_cer(obj)
+
+Remember that CER is **not valid** DER in most cases, so you **have to**
+use :ref:`bered <bered_ctx>` :ref:`ctx <ctx>` option during its
+decoding. Also currently there is **no** validation that provided CER is
+valid one -- you are sure that it has only valid BER encoding.
+
+.. warning::
+
+   SET OF values can not be streamingly encoded, because they are
+   required to be sorted byte-by-byte. Big SET OF values still will take
+   much memory. Use neither SET nor SET OF values, as modern ASN.1
+   also recommends too.
+
+Do not forget about using :ref:`mmap-ed <mmap>` memoryviews for your
+OCTET STRINGs! They will be streamingly copied from underlying file to
+the buffer using 1 KB chunks.
+
+Some structures require that some of the elements have to be forcefully
+DER encoded. For example ``SignedData`` CMS requires you to encode
+``SignedAttributes`` and X.509 certificates in DER form, allowing you to
+encode everything else in BER. You can tell any of the structures to be
+forcefully encoded in DER during CER encoding, by specifying
+``der_forced=True`` attribute::
+
+    class Certificate(Sequence):
+        schema = (...)
+        der_forced = True
+
+    class SignedAttributes(SetOf):
+        schema = Attribute()
+        bounds = (1, 32)
+        der_forced = True
+
+agg_octet_string
+________________
+
+In most cases, huge quantity of binary data is stored as OCTET STRING.
+CER encoding splits it on 1 KB chunks. BER allows splitting on various
+levels of chunks inclusion::
+
+    SOME STRING[CONSTRUCTED]
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+        OCTET STRING[PRIMITIVE]
+            DATA CHUNK
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+            OCTET STRING[PRIMITIVE]
+                DATA CHUNK
+        OCTET STRING[CONSTRUCTED]
+            OCTET STRING[CONSTRUCTED]
+                OCTET STRING[PRIMITIVE]
+                    DATA CHUNK
+
+You can not just take the offset and some ``.vlen`` of the STRING and
+treat it as the payload. If you decode it without
+:ref:`evgen mode <evgen_mode>`, then it will be automatically aggregated
+and ``bytes()`` will give the whole payload contents.
+
+You are forced to use :ref:`evgen mode <evgen_mode>` for decoding for
+small memory footprint. There is convenient
+:py:func:`pyderasn.agg_octet_string` helper for reconstructing the
+payload. Let's assume you have got BER/CER encoded ``ContentInfo`` with
+huge ``SignedData`` and ``EncapsulatedContentInfo``. Let's calculate the
+SHA512 digest of its ``eContent``::
+
+    fd = open("data.p7m", "rb")
+    raw = file_mmaped(fd)
+    ctx = {"bered": True}
+    for decode_path, obj, _ in ContentInfo().decode_evgen(raw, ctx=ctx):
+        if decode_path == ("content",):
+            content = obj
+            break
+    else:
+        raise ValueError("no content found")
+    hasher_state = sha512()
+    def hasher(data):
+        hasher_state.update(data)
+        return len(data)
+    evgens = SignedData().decode_evgen(
+        raw[content.offset:],
+        offset=content.offset,
+        ctx=ctx,
+    )
+    agg_octet_string(evgens, ("encapContentInfo", "eContent"), raw, hasher)
+    fd.close()
+    digest = hasher_state.digest()
+
+Simply replace ``hasher`` with some writeable file's ``fd.write`` to
+copy the payload (without BER/CER encoding interleaved overhead) in it.
+Virtually it won't take memory more than for keeping small structures
+and 1 KB binary chunks.
+
+SEQUENCE OF iterators
+_____________________
+
+You can use iterators as a value in :py:class:`pyderasn.SequenceOf`
+classes. The only difference with providing the full list of objects, is
+that type and bounds checking is done during encoding process. Also
+sequence's value will be emptied after encoding, forcing you to set its
+value again.
+
+This is very useful when you have to create some huge objects, like
+CRLs, with thousands and millions of entities inside. You can write the
+generator taking necessary data from the database and giving the
+``RevokedCertificate`` objects. Only binary representation of that
+objects will take memory during DER encoding.
+
  Base Obj
  --------
  .. autoclass:: pyderasn.Obj
  Base Obj
  --------
  .. autoclass:: pyderasn.Obj
@@ -642,6 +951,7 @@ Various
  -------
  
  .. autofunction:: pyderasn.abs_decode_path
  -------
  
  .. autofunction:: pyderasn.abs_decode_path
+.. autofunction:: pyderasn.agg_octet_string
  .. autofunction:: pyderasn.colonize_hex
  .. autofunction:: pyderasn.encode_cer
  .. autofunction:: pyderasn.file_mmaped
  .. autofunction:: pyderasn.colonize_hex
  .. autofunction:: pyderasn.encode_cer
  .. autofunction:: pyderasn.file_mmaped
@@ -1213,7 +1523,11 @@ LEN1K = len_encode(1000)
  def write_full(writer, data):
      """Fully write provided data
  
  def write_full(writer, data):
      """Fully write provided data
  
-    BytesIO does not guarantee that the whole data will be written at once.
+    :param writer: must comply with ``io.RawIOBase.write`` behaviour
+
+    BytesIO does not guarantee that the whole data will be written at
+    once. That function write everything provided, raising an error if
+    ``writer`` returns None.
      """
      data = memoryview(data)
      written = 0
      """
      data = memoryview(data)
      written = 0
@@ -1386,7 +1700,7 @@ class Obj(object):
          yield NotImplemented
  
      def encode(self):
          yield NotImplemented
  
      def encode(self):
-        """Encode the structure
+        """DER encode the structure
  
          :returns: DER representation
          """
  
          :returns: DER representation
          """
@@ -1396,6 +1710,13 @@ class Obj(object):
          return b"".join((self._expl, len_encode(len(raw)), raw))
  
      def encode_cer(self, writer):
          return b"".join((self._expl, len_encode(len(raw)), raw))
  
      def encode_cer(self, writer):
+        """CER encode the structure to specified writer
+
+        :param writer: must comply with ``io.RawIOBase.write``
+                       behaviour. It takes slice to be written and
+                       returns number of bytes processed. If it returns
+                       None, then exception will be raised
+        """
          if self._expl is not None:
              write_full(writer, self._expl + LENINDEF)
          if getattr(self, "der_forced", False):
          if self._expl is not None:
              write_full(writer, self._expl + LENINDEF)
          if getattr(self, "der_forced", False):
@@ -1423,6 +1744,26 @@ class Obj(object):
              tag_only=False,
              _ctx_immutable=True,
      ):
              tag_only=False,
              _ctx_immutable=True,
      ):
+        """Decode the data
+
+        :param data: either binary or memoryview
+        :param int offset: initial data's offset
+        :param bool leavemm: do we need to leave memoryview of remaining
+                    data as is, or convert it to bytes otherwise
+        :param decode_path: current decode path (tuples of strings,
+                            possibly with DecodePathDefBy) with will be
+                            the root for all underlying objects
+        :param ctx: optional :ref:`context <ctx>` governing decoding process
+        :param bool tag_only: decode only the tag, without length and
+                              contents (used only in Choice and Set
+                              structures, trying to determine if tag satisfies
+                              the schema)
+        :param bool _ctx_immutable: do we need to ``copy.copy()`` ``ctx``
+                                    before using it?
+        :returns: (Obj, remaining data)
+
+        .. seealso:: :ref:`decoding`
+        """
          result = next(self.decode_evgen(
              data,
              offset,
          result = next(self.decode_evgen(
              data,
              offset,
@@ -1449,21 +1790,11 @@ class Obj(object):
              _ctx_immutable=True,
              _evgen_mode=True,
      ):
              _ctx_immutable=True,
              _evgen_mode=True,
      ):
-        """Decode the data
+        """Decode with evgen mode on
  
  
-        :param data: either binary or memoryview
-        :param int offset: initial data's offset
-        :param bool leavemm: do we need to leave memoryview of remaining
-                    data as is, or convert it to bytes otherwise
-        :param ctx: optional :ref:`context <ctx>` governing decoding process
-        :param tag_only: decode only the tag, without length and contents
-                         (used only in Choice and Set structures, trying to
-                         determine if tag satisfies the schema)
-        :param _ctx_immutable: do we need to ``copy.copy()`` ``ctx``
-                               before using it?
-        :returns: (Obj, remaining data)
-
-        .. seealso:: :ref:`decoding`
+        That method is identical to :py:meth:`pyderasn.Obj.decode`, but
+        it returns the generator producing ``(decode_path, obj, tail)``
+        values. See :ref:`evgen mode <evgen_mode>`.
          """
          if ctx is None:
              ctx = {}
          """
          if ctx is None:
              ctx = {}
@@ -1700,7 +2031,9 @@ class Obj(object):
  
  
  def encode_cer(obj):
  
  
  def encode_cer(obj):
-    """Encode to CER in memory
+    """Encode to CER in memory buffer
+
+    :returns bytes: memory buffer contents
      """
      buf = BytesIO()
      obj.encode_cer(buf.write)
      """
      buf = BytesIO()
      obj.encode_cer(buf.write)
@@ -3559,8 +3892,10 @@ def agg_octet_string(evgens, decode_path, raw, writer):
      :param evgens: iterator of generated events
      :param decode_path: points to the string we want to decode
      :param raw: slicebable (memoryview, bytearray, etc) with
      :param evgens: iterator of generated events
      :param decode_path: points to the string we want to decode
      :param raw: slicebable (memoryview, bytearray, etc) with
-                the data evgens are generated one
+                the data evgens are generated on
      :param writer: buffer.write where string is going to be saved
      :param writer: buffer.write where string is going to be saved
+    :param writer: where string is going to be saved. Must comply
+                   with ``io.RawIOBase.write`` behaviour
      """
      decode_path_len = len(decode_path)
      for dp, obj, _ in evgens:
      """
      decode_path_len = len(decode_path)
      for dp, obj, _ in evgens: