Source code for invenio_files_rest.helpers

# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2016-2019 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""File serving helpers for Files REST API."""

import hashlib
import mimetypes
import os
import unicodedata
import warnings
from time import time
from urllib.parse import urlsplit

from flask import current_app, make_response, request
from werkzeug.datastructures import Headers
from werkzeug.urls import url_quote
from werkzeug.wsgi import FileWrapper

MIMETYPE_TEXTFILES = {"readme"}

MIMETYPE_WHITELIST = {
    "audio/mpeg",
    "audio/ogg",
    "audio/wav",
    "audio/webm",
    "image/gif",
    "image/jpeg",
    "image/png",
    "image/tiff",
    "text/plain",
}
"""List of whitelisted MIME types.

.. warning::

   Do not add new types to this list unless you know what you are doing. You
   could potentially open up for XSS attacks.
"""

MIMETYPE_PLAINTEXT = {
    "application/javascript",
    "application/json",
    "application/xhtml+xml",
    "application/xml",
    "text/css",
    "text/csv",
    "text/html",
    "image/svg+xml",
}


[docs]def chunk_size_or_default(chunk_size):
    """Use default chunksize if not configured."""
    return chunk_size or 5 * 1024 * 1024  # 5MiB


[docs]def send_stream(
    stream,
    filename,
    size,
    mtime,
    mimetype=None,
    restricted=True,
    as_attachment=False,
    etag=None,
    content_md5=None,
    chunk_size=None,
    conditional=True,
    trusted=False,
):
    """Send the contents of a file to the client.

    .. warning::

        It is very easy to be exposed to Cross-Site Scripting (XSS) attacks if
        you serve user uploaded files. Here are some recommendations:

            1. Serve user uploaded files from a separate domain
               (not a subdomain). This way a malicious file can only attack
               other user uploaded files.
            2. Prevent the browser from rendering and executing HTML files (by
               setting ``trusted=False``).
            3. Force the browser to download the file as an attachment
               (``as_attachment=True``).

    :param stream: The file stream to send.
    :param filename: The file name.
    :param size: The file size.
    :param mtime: A Unix timestamp that represents last modified time (UTC).
    :param mimetype: The file mimetype. If ``None``, the module will try to
        guess. (Default: ``None``)
    :param restricted: If the file is not restricted, the module will set the
        cache-control. (Default: ``True``)
    :param as_attachment: If the file is an attachment. (Default: ``False``)
    :param etag: If defined, it will be set as HTTP E-Tag.
    :param content_md5: If defined, a HTTP Content-MD5 header will be set.
    :param chunk_size: The chunk size.
    :param conditional: Make the response conditional to the request.
        (Default: ``True``)
    :param trusted: Do not enable this option unless you know what you are
        doing. By default this function will send HTTP headers and MIME types
        that prevents your browser from rendering e.g. a HTML file which could
        contain a malicious script tag.
        (Default: ``False``)
    :returns: A Flask response instance.
    """
    chunk_size = chunk_size_or_default(chunk_size)

    # Guess mimetype from filename if not provided.
    if mimetype is None and filename:
        mimetype = mimetypes.guess_type(filename)[0]
    if mimetype is None:
        mimetype = "application/octet-stream"

    # Construct headers
    headers = Headers()
    headers["Content-Length"] = size
    if content_md5:
        headers["Content-MD5"] = content_md5

    if not trusted:
        # Sanitize MIME type
        mimetype = sanitize_mimetype(mimetype, filename=filename)
        # See https://www.owasp.org/index.php/OWASP_Secure_Headers_Project
        # Prevent JavaScript execution
        headers["Content-Security-Policy"] = "default-src 'none';"
        # Prevent MIME type sniffing for browser.
        headers["X-Content-Type-Options"] = "nosniff"
        # Prevent opening of downloaded file by IE
        headers["X-Download-Options"] = "noopen"
        # Prevent cross domain requests from Flash/Acrobat.
        headers["X-Permitted-Cross-Domain-Policies"] = "none"
        # Prevent files from being embedded in frame, iframe and object tags.
        headers["X-Frame-Options"] = "deny"
        # Enable XSS protection (IE, Chrome, Safari)
        headers["X-XSS-Protection"] = "1; mode=block"

    # Force Content-Disposition for application/octet-stream to prevent
    # Content-Type sniffing.
    if as_attachment or mimetype == "application/octet-stream":
        # See https://github.com/pallets/flask/commit/0049922f2e690a6d
        try:
            filenames = {"filename": filename.encode("latin-1")}
        except UnicodeEncodeError:
            filenames = {"filename*": "UTF-8''%s" % url_quote(filename)}
            encoded_filename = unicodedata.normalize("NFKD", filename).encode(
                "latin-1", "ignore"
            )
            if encoded_filename:
                filenames["filename"] = encoded_filename
        headers.add("Content-Disposition", "attachment", **filenames)
    else:
        headers.add("Content-Disposition", "inline")

    # Construct response object.
    rv = current_app.response_class(
        FileWrapper(stream, buffer_size=chunk_size),
        mimetype=mimetype,
        headers=headers,
        direct_passthrough=True,
    )

    # Set etag if defined
    if etag:
        rv.set_etag(etag)

    # Set last modified time
    if mtime is not None:
        rv.last_modified = int(mtime)

    # Set cache-control
    if not restricted:
        rv.cache_control.public = True
        cache_timeout = current_app.get_send_file_max_age(filename)
        if cache_timeout is not None:
            rv.cache_control.max_age = cache_timeout
            rv.expires = int(time() + cache_timeout)

    if conditional:
        rv = rv.make_conditional(request)

    return rv


[docs]def sanitize_mimetype(mimetype, filename=None):
    """Sanitize a MIME type so the browser does not render the file."""
    # Allow some few mime type like plain text, images and audio.
    if mimetype in MIMETYPE_WHITELIST:
        return mimetype
    # Rewrite HTML, JavaScript, CSS etc to text/plain.
    if mimetype in MIMETYPE_PLAINTEXT or (
        filename and filename.lower() in MIMETYPE_TEXTFILES
    ):
        return "text/plain"
    # Default
    return "application/octet-stream"


[docs]def make_path(base_uri, path, filename, path_dimensions, split_length):
    """Generate a path as base location for file instance.

    :param base_uri: The base URI.
    :param path: The relative path.
    :param path_dimensions: Number of chunks the path should be split into.
    :param split_length: The length of any chunk.
    :returns: A string representing the full path.
    """
    assert len(path) > path_dimensions * split_length

    uri_parts = []
    for i in range(path_dimensions):
        uri_parts.append(path[0:split_length])
        path = path[split_length:]
    uri_parts.append(path)
    uri_parts.append(filename)

    return os.path.join(base_uri, *uri_parts)


[docs]def compute_md5_checksum(stream, **kwargs):
    """Get helper method to compute MD5 checksum from a stream.

    :param stream: The input stream.
    :returns: The MD5 checksum.
    """
    return compute_checksum(stream, "md5", hashlib.md5(), **kwargs)


[docs]def compute_checksum(
    stream, algo, message_digest, chunk_size=None, progress_callback=None
):
    """Get helper method to compute checksum from a stream.

    :param stream: File-like object.
    :param algo: Identifier for checksum algorithm.
    :param messsage_digest: A message digest instance.
    :param chunk_size: Read at most size bytes from the file at a time.
    :param progress_callback: Function accepting one argument with number
        of bytes read. (Default: ``None``)
    :returns: The checksum.
    """
    chunk_size = chunk_size_or_default(chunk_size)

    bytes_read = 0
    while 1:
        chunk = stream.read(chunk_size)
        if not chunk:
            if progress_callback:
                progress_callback(bytes_read)
            break
        message_digest.update(chunk)
        bytes_read += len(chunk)
        if progress_callback:
            progress_callback(bytes_read)
    return "{0}:{1}".format(algo, message_digest.hexdigest())


[docs]def populate_from_path(bucket, source, checksum=True, key_prefix="", chunk_size=None):
    """Populate a ``bucket`` from all files in path.

    :param bucket: The bucket (instance or id) to create the object in.
    :param source: The file or directory path.
    :param checksum: If ``True`` then a MD5 checksum will be computed for each
        file. (Default: ``True``)
    :param key_prefix: The key prefix for the bucket.
    :param chunk_size: Chunk size to read from file.
    :returns: A iterator for all
        :class:`invenio_files_rest.models.ObjectVersion` instances.
    """
    from .models import FileInstance, ObjectVersion

    def create_file(key, path):
        """Create new ``ObjectVersion`` from path or existing ``FileInstance``.

        It checks MD5 checksum and size of existing ``FileInstance``s.
        """
        key = key_prefix + key

        if checksum:
            file_checksum = compute_md5_checksum(
                open(path, "rb"), chunk_size=chunk_size
            )
            file_instance = FileInstance.query.filter_by(
                checksum=file_checksum, size=os.path.getsize(path)
            ).first()
            if file_instance:
                return ObjectVersion.create(bucket, key, _file_id=file_instance.id)
        return ObjectVersion.create(bucket, key, stream=open(path, "rb"))

    if os.path.isfile(source):
        yield create_file(os.path.basename(source), source)
    else:
        for root, dirs, files in os.walk(source, topdown=False):
            for name in files:
                filename = os.path.join(root, name)
                assert filename.startswith(source)
                parts = [p for p in filename[len(source) :].split(os.sep) if p]
                yield create_file("/".join(parts), os.path.join(root, name))


[docs]def create_file_streaming_redirect_response(obj):
    """Redirect response generating function."""
    warnings.warn("This streaming does not support multiple storage backends.")
    response = make_response()
    redirect_url_base = "/user_files/"
    redirect_url_key = urlsplit(obj.file.uri).path
    response.headers["X-Accel-Redirect"] = redirect_url_base + redirect_url_key[1:]
    return response
Source code for invenio_files_rest.helpers

Invenio-Files-REST

Navigation

Related Topics