From af4b3b06c0d75a01c1fa59f1f6a4bae12e69daa5 Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Tue, 22 Nov 2022 16:15:50 -0500 Subject: [PATCH] Add support for expiring files SUPPLEMENTALLY: - Add an `expiration` field to the `file` table of the database - Produce a migration for the above change - Overhaul the cleanup script, and integrate into fhost.py (now run using FLASK_APP=fhost flask prune) - Replace the old cleanup script with a deprecation notice - Add information about how to expire files to the index - Update the README with information about the new script Squashed commits: Add a note explaining that expired files aren't immediately removed Show correct times on the index page graph Improve the migration script, removing the need for --legacy Use automap in place of an explicit file map in migration Remove vestigial `touch()` Don't crash when upgrading a fresh database Remove vestigial warning about legacy files More efficiently filter to unexpired files when migrating https://git.0x0.st/mia/0x0/pulls/72#issuecomment-224 Coalesce updates to the database during migration https://git.0x0.st/mia/0x0/pulls/72#issuecomment-226 Remove vestigial database model https://git.0x0.st/mia/0x0/pulls/72#issuecomment-261 prune: Stream expired files from the database (as opposed to collecting them all first) config.example.py: Add min & max expiration + description --- README.rst | 4 +- cleanup.py | 48 +------- fhost.py | 165 +++++++++++++++++++++++++-- instance/config.example.py | 13 +++ migrations/versions/939a08e1d6e5_.py | 81 +++++++++++++ templates/index.html | 15 ++- 6 files changed, 269 insertions(+), 57 deletions(-) create mode 100644 migrations/versions/939a08e1d6e5_.py diff --git a/README.rst b/README.rst index f13167c..af9074d 100644 --- a/README.rst +++ b/README.rst @@ -35,8 +35,8 @@ downsides, one of them being that range requests will not work. 
This is a problem for example when streaming media files: It won’t be possible to seek, and some ISOBMFF (MP4) files will not play at all. -To make files expire, simply create a cronjob that runs ``cleanup.py`` every -now and then. +To make files expire, simply create a cronjob that runs ``FLASK_APP=fhost +flask prune`` every now and then. Before running the service for the first time, run ``FLASK_APP=fhost flask db upgrade``. diff --git a/cleanup.py b/cleanup.py index 0f9a5ce..14fbc61 100755 --- a/cleanup.py +++ b/cleanup.py @@ -1,44 +1,8 @@ #!/usr/bin/env python3 -""" - Copyright © 2020 Mia Herkt - Licensed under the EUPL, Version 1.2 or - as soon as approved - by the European Commission - subsequent versions of the EUPL - (the "License"); - You may not use this work except in compliance with the License. - You may obtain a copy of the license at: - - https://joinup.ec.europa.eu/software/page/eupl - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - either express or implied. - See the License for the specific language governing permissions - and limitations under the License. 
# cleanup.py no longer prunes anything itself: it only tells the operator which
# command to run instead, then exits with a failing status so that an existing
# cronjob still pointing at this script is noticed.
print("This script has been replaced!!")
print("Instead, please run")
print("")
print(" $ FLASK_APP=fhost flask prune")
print("")
# `exit()` is a helper injected by the `site` module for interactive sessions and is
# not guaranteed to exist (e.g. under `python -S`); raising SystemExit is equivalent
# and always available.
raise SystemExit(1)
def get_expiration() -> int:
    """Return the epoch millisecond at which this file should expire.

    Closure over the enclosing ``store`` call: reads ``requested_expiration``
    (the caller's wish) and ``data`` (the uploaded bytes).

    Every file gets a computed latest-allowed expiration, derived from its size
    via ``get_max_lifespan``.  The uploader can voluntarily shorten that, either
    with a duration in hours or with an absolute timestamp in epoch millis, but
    can never extend it: anything later is rounded down to the computed cap.
    """
    # Requests below this value are read as a duration in hours, anything at or
    # above it as an absolute epoch-millisecond timestamp.  The constant itself
    # is 2022-04-20T13:12:00Z expressed in epoch millis.
    timestamp_threshold = 1650460320000
    ms_per_hour = 60 * 60 * 1000

    # Sample the clock once so the cap and the requested time share the same
    # "now", and truncate to int so the declared return type (and the integer
    # database column it is stored in) actually receives an integer.
    now_ms = int(time.time() * 1000)

    # The latest allowed expiration date for this file, in epoch millis
    latest_allowed = now_ms + get_max_lifespan(len(data))

    if requested_expiration is None:
        return latest_allowed
    if requested_expiration < timestamp_threshold:
        # Treat the requested expiration time as a duration in hours
        return min(latest_allowed, now_ms + requested_expiration * ms_per_hour)
    # Treat the requested expiration time as a timestamp in epoch millis
    return min(latest_allowed, requested_expiration)
def store_file(f, requested_expiration: typing.Optional[int], addr):
    """Store an uploaded file for ``addr`` and return the stored file's URL.

    Uploads from blacklisted hosts (per ``in_upload_bl``) are refused with a
    451 response tuple instead of being stored.

    ``requested_expiration`` can be:
      - None, to use the longest allowed file lifespan
      - a duration (in hours) that the file should live for
      - a timestamp in epoch millis that the file should expire at

    Any value greater than the longest allowed file lifespan will be rounded
    down to that value.
    """
    if not in_upload_bl(addr):
        stored = File.store(f, requested_expiration, addr)
        return stored.geturl()

    return "Your host is blocked from uploading files.\n", 451
@app.cli.command("prune")
def prune():
    """
    Clean up expired files

    Deletes any files from the filesystem which have hit their expiration time. This
    doesn't remove them from the database, only from the filesystem. It's recommended
    that server owners run this command regularly, or set it up on a timer.
    """
    # Expirations are stored as epoch milliseconds
    current_time = int(time.time() * 1000)

    # The path to where uploaded files are stored
    storage = Path(app.config["FHOST_STORAGE_PATH"])

    # Every file whose expiration time has passed.  A NULL expiration marks a
    # file that has already been pruned, so those are excluded.
    expired_files = File.query.where(
        and_(
            File.expiration.is_not(None),
            File.expiration < current_time
        )
    )

    files_removed = 0
    aborted = False

    for file in expired_files:
        # Log the file we're about to remove
        file_name = file.getname()
        file_hash = file.sha256
        file_path = storage / file_hash
        print(f"Removing expired file {file_name} [{file_hash}]")

        # Remove it from the file system
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass  # If the file was already gone, we're good
        except OSError as e:
            print(e)
            # The f-prefix and the explicit separators matter here: adjacent string
            # literals are joined verbatim, with no space or newline between them.
            print(
                "\n------------------------------------\n"
                f"Encountered an error while trying to remove file {file_path}. Double "
                "check to make sure the server is configured correctly, permissions are "
                "okay, and everything is ship shape, then try again."
            )
            aborted = True
            break
        else:
            files_removed += 1

        # Finally, mark that the file was removed
        file.expiration = None

    # Commit even when bailing out early, so files that were already deleted
    # from disk are recorded as pruned instead of being retried on the next run.
    db.session.commit()

    if aborted:
        return

    print(f"\nDone! {files_removed} file(s) removed")
def get_max_lifespan(filesize: int) -> int:
    """Return the longest lifespan, in milliseconds, allowed for a file of *filesize* bytes.

    Based on the current app's configuration: ``MAX_CONTENT_LENGTH`` as well as
    ``FHOST_MIN_EXPIRATION`` and ``FHOST_MAX_EXPIRATION``.  The result follows
    a cubic curve from the maximum expiration (empty file) down to the minimum
    expiration (file of ``MAX_CONTENT_LENGTH`` bytes):

        min_exp + (-max_exp + min_exp) * (filesize / max_size - 1) ** 3

    This lifespan may be shortened by a user's request, but no file should be
    allowed to expire at a point after this number.
    """
    min_exp = app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)
    # Flask ships MAX_CONTENT_LENGTH=None in its default config, so the key is
    # always present and a `.get(key, default)` fallback would never kick in;
    # `or` covers the unset (None) case without dividing by None.
    max_size = app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024

    # Keep the size ratio within [0, 1] so a file larger than the configured
    # limit still gets the minimum lifespan rather than something below it.
    fraction = min(max(filesize / max_size, 0.0), 1.0)

    return min_exp + int((-max_exp + min_exp) * (fraction - 1) ** 3)
def get_max_lifespan(filesize: int) -> int:
    """Return the longest lifespan, in milliseconds, allowed for a file of *filesize* bytes.

    Based on the current app's configuration: ``MAX_CONTENT_LENGTH`` as well as
    ``FHOST_MIN_EXPIRATION`` and ``FHOST_MAX_EXPIRATION``.  The result follows
    a cubic curve from the maximum expiration (empty file) down to the minimum
    expiration (file of ``MAX_CONTENT_LENGTH`` bytes).

    This lifespan may be shortened by a user's request, but no file should be
    allowed to expire at a point after this number.
    """
    min_exp = current_app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = current_app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)
    # Flask ships MAX_CONTENT_LENGTH=None in its default config, so a
    # `.get(key, default)` fallback would never apply; `or` handles None.
    max_size = current_app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024

    # Files larger than the current limit (e.g. uploaded before the limit was
    # lowered) still get the minimum lifespan rather than something below it.
    fraction = min(max(filesize / max_size, 0.0), 1.0)

    return min_exp + int((-max_exp + min_exp) * (fraction - 1) ** 3)


Base = automap_base()

# Upper bound on how many hashes go into a single `IN (...)` clause.  Each hash is
# a bound parameter and SQLite caps the number of parameters per statement.
_IN_CLAUSE_BATCH = 500


def upgrade():
    """Add the ``expiration`` column and backfill it for files still on disk.

    Each non-removed file that is present in the storage directory gets an
    expiration of its on-disk modification time plus the maximum lifespan for
    its size, in epoch milliseconds.  Rows without a file on disk keep a NULL
    expiration.
    """
    op.add_column('file', sa.Column('expiration', sa.BigInteger()))

    bind = op.get_bind()
    Base.prepare(autoload_with=bind)
    File = Base.classes.file
    session = Session(bind=bind)

    storage = Path(current_app.config["FHOST_STORAGE_PATH"])

    # List of file hashes which have not expired yet
    # This could get really big for some servers
    try:
        unexpired_files = os.listdir(storage)
    except FileNotFoundError:
        return  # There are no currently unexpired files

    updates = []  # We coalesce updates to the database here

    # Calculate an expiration date for all existing files, querying in batches
    # so that a large storage directory cannot overflow the statement's
    # bound-parameter limit.
    for start in range(0, len(unexpired_files), _IN_CLAUSE_BATCH):
        batch = unexpired_files[start:start + _IN_CLAUSE_BATCH]
        files = session.scalars(
            sa.select(File)
            .where(
                sa.not_(File.removed),
                File.sha256.in_(batch)
            )
        )
        for file in files:
            try:
                stat = os.stat(storage / file.sha256)
            except FileNotFoundError:
                # Deleted between listing the directory and now; leave it unset
                continue
            max_age = get_max_lifespan(stat.st_size)  # How long the file is allowed to live, in ms
            file_birth = stat.st_mtime * 1000  # Last modification time of the stored file, in ms
            updates.append({'id': file.id, 'expiration': int(file_birth + max_age)})

    # Apply coalesced updates
    session.bulk_update_mappings(File, updates)
    session.commit()


def downgrade():
    """Drop the ``expiration`` column again."""
    op.drop_column('file', 'expiration')
+ {{'{: 6}'.format(config.get("FHOST_MIN_EXPIRATION", 2592000000)//86400000)}} | .................... 0{{ ((config["MAX_CONTENT_LENGTH"]/2)|filesizeformat(True)).split(" ")[0].rjust(27) }}{{ max_size.split(" ")[0].rjust(27) }} {{ max_size.split(" ")[1].rjust(54) }}