Add support for expiring files

SUPPLEMENTALLY:
- Add an `expiration` field to the `file` table of the database
- Produce a migration for the above change
- Overhaul the cleanup script, and integrate into fhost.py
  (now run using FLASK_APP=fhost flask prune)
- Replace the old cleanup script with a deprecation notice
- Add information about how to expire files to the index
- Update the README with information about the new script

Squashed commits:

Add a note explaining that expired files aren't immediately removed

Show correct times on the index page graph

Improve the migration script, removing the need for --legacy

Use automap in place of an explicit file map in migration

Remove vestigial `touch()`

Don't crash when upgrading a fresh database

Remove vestigial warning about legacy files

More efficiently filter to unexpired files when migrating

#72 (comment)

Coalesce updates to the database during migration

#72 (comment)

Remove vestigial database model

#72 (comment)

prune:  Stream expired files from the database

(as opposed to collecting them all first)

config.example.py:  Add min & max expiration + description
This commit is contained in:
Emi Simpson 2022-11-22 16:15:50 -05:00 committed by Mia Herkt
parent 00dba0e189
commit af4b3b06c0
Signed by: mia
GPG Key ID: 72E154B8622EC191
6 changed files with 269 additions and 57 deletions

View File

@ -35,8 +35,8 @@ downsides, one of them being that range requests will not work. This is a
problem for example when streaming media files: It won't be possible to seek,
and some ISOBMFF (MP4) files will not play at all.
To make files expire, simply create a cronjob that runs ``cleanup.py`` every
now and then.
To make files expire, simply create a cronjob that runs ``FLASK_APP=fhost
flask prune`` every now and then.
Before running the service for the first time, run ``FLASK_APP=fhost flask db upgrade``.

View File

@ -1,44 +1,8 @@
#!/usr/bin/env python3
"""
Copyright © 2020 Mia Herkt
Licensed under the EUPL, Version 1.2 or - as soon as approved
by the European Commission - subsequent versions of the EUPL
(the "License");
You may not use this work except in compliance with the License.
You may obtain a copy of the license at:
https://joinup.ec.europa.eu/software/page/eupl
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied.
See the License for the specific language governing permissions
and limitations under the License.
"""
import os
import sys
import time
import datetime
from fhost import app
os.chdir(os.path.dirname(sys.argv[0]))
os.chdir(app.config["FHOST_STORAGE_PATH"])
files = [f for f in os.listdir(".")]
maxs = app.config["MAX_CONTENT_LENGTH"]
mind = 30
maxd = 365
for f in files:
stat = os.stat(f)
systime = time.time()
age = datetime.timedelta(seconds=(systime - stat.st_mtime)).days
maxage = mind + (-maxd + mind) * (stat.st_size / maxs - 1) ** 3
if age >= maxage:
os.remove(f)
# This script no longer prunes anything itself: it only tells the operator
# which command replaces it and then exits with status 1.
print("This script has been replaced!!")
print("Instead, please run")
print("")
print(" $ FLASK_APP=fhost flask prune")
print("")
# `exit` is a helper injected by the `site` module for interactive sessions and
# is unavailable under `python -S`; raising SystemExit needs no import.
raise SystemExit(1)

165
fhost.py
View File

@ -22,12 +22,17 @@
from flask import Flask, abort, make_response, redirect, request, send_from_directory, url_for, Response, render_template
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from sqlalchemy import and_
from jinja2.exceptions import *
from jinja2 import ChoiceLoader, FileSystemLoader
from hashlib import sha256
from magic import Magic
from mimetypes import guess_extension
import click
import os
import sys
import time
import typing
import requests
from validators import url as url_valid
from pathlib import Path
@ -121,12 +126,14 @@ class File(db.Model):
addr = db.Column(db.UnicodeText)
removed = db.Column(db.Boolean, default=False)
nsfw_score = db.Column(db.Float)
expiration = db.Column(db.BigInteger)
def __init__(self, sha256, ext, mime, addr):
def __init__(self, sha256, ext, mime, addr, expiration):
self.sha256 = sha256
self.ext = ext
self.mime = mime
self.addr = addr
self.expiration = expiration
def getname(self):
return u"{0}{1}".format(su.enbase(self.id), self.ext)
@ -139,7 +146,16 @@ class File(db.Model):
else:
return url_for("get", path=n, _external=True) + "\n"
def store(file_, addr):
"""
requested_expiration can be:
- None, to use the longest allowed file lifespan
- a duration (in hours) that the file should live for
- a timestamp in epoch millis that the file should expire at
Any value greater than the longest allowed file lifespan will be rounded down to that
value.
"""
def store(file_, requested_expiration: typing.Optional[int], addr):
data = file_.read()
digest = sha256(data).hexdigest()
@ -175,15 +191,51 @@ class File(db.Model):
return ext[:app.config["FHOST_MAX_EXT_LENGTH"]] or ".bin"
f = File.query.filter_by(sha256=digest).first()
# Compute the expiration timestamp (in epoch milliseconds) for this upload.
#
# Every file is assigned a maximum expiration derived from its size (see
# get_max_lifespan). The user may voluntarily shorten it by supplying
# requested_expiration, either as a duration in hours or as a timestamp in
# epoch millis; a request can never push the expiration past that maximum.
def get_expiration() -> int:
    # Sample the clock once so both bounds are relative to the same instant,
    # and truncate to an int so the result matches the annotation and the
    # BigInteger `expiration` column.
    current_epoch_millis = int(time.time() * 1000)

    # Maximum lifetime of the file in milliseconds
    this_files_max_lifespan = get_max_lifespan(len(data))

    # The latest allowed expiration date for this file, in epoch millis
    this_files_max_expiration = current_epoch_millis + this_files_max_lifespan

    if requested_expiration is None:
        return this_files_max_expiration

    if requested_expiration < 1650460320000:
        # Values below this cutoff are treated as a duration in hours
        requested_expiration_ms = requested_expiration * 60 * 60 * 1000
        return min(this_files_max_expiration, current_epoch_millis + requested_expiration_ms)

    # Otherwise treat the requested expiration as a timestamp in epoch millis
    return min(this_files_max_expiration, requested_expiration)
f = File.query.filter_by(sha256=digest).first()
if f:
# If the file already exists
if f.removed:
# The file was removed by moderation, so don't accept it back
abort(451)
if f.expiration is None:
# The file has expired, so give it a new expiration date
f.expiration = get_expiration()
else:
# The file already exists, update the expiration if needed
f.expiration = max(f.expiration, get_expiration())
else:
mime = get_mime()
ext = get_ext(mime)
f = File(digest, ext, mime, addr)
expiration = get_expiration()
f = File(digest, ext, mime, addr, expiration)
f.addr = addr
@ -194,8 +246,6 @@ class File(db.Model):
if not p.is_file():
with open(p, "wb") as of:
of.write(data)
else:
p.touch()
if not f.nsfw_score and app.config["NSFW_DETECT"]:
f.nsfw_score = nsfw.detect(p)
@ -260,11 +310,20 @@ def in_upload_bl(addr):
return False
def store_file(f, addr):
"""
requested_expiration can be:
- None, to use the longest allowed file lifespan
- a duration (in hours) that the file should live for
- a timestamp in epoch millis that the file should expire at
Any value greater than the longest allowed file lifespan will be rounded down to that
value.
"""
def store_file(f, requested_expiration: typing.Optional[int], addr):
if in_upload_bl(addr):
return "Your host is blocked from uploading files.\n", 451
sf = File.store(f, addr)
sf = File.store(f, requested_expiration, addr)
return sf.geturl()
@ -289,7 +348,7 @@ def store_url(url, addr):
f = urlfile(read=r.raw.read, content_type=r.headers["content-type"], filename="")
return store_file(f, addr)
return store_file(f, None, addr)
else:
abort(413)
else:
@ -336,7 +395,23 @@ def fhost():
sf = None
if "file" in request.files:
return store_file(request.files["file"], request.remote_addr)
try:
# Store the file with the requested expiration date
return store_file(
request.files["file"],
int(request.form["expires"]),
request.remote_addr
)
except ValueError:
# The requested expiration date wasn't properly formed
abort(400)
except KeyError:
# No expiration date was requested, store with the max lifespan
return store_file(
request.files["file"],
None,
request.remote_addr
)
elif "url" in request.form:
return store_url(request.form["url"], request.remote_addr)
elif "shorten" in request.form:
@ -364,3 +439,73 @@ def ehandler(e):
return render_template(f"{e.code}.html", id=id), e.code
except TemplateNotFound:
return "Segmentation fault\n", e.code
@app.cli.command("prune")
def prune():
    """
    Clean up expired files

    Deletes any files from the filesystem which have hit their expiration time. This
    doesn't remove them from the database, only from the filesystem. It's recommended
    that server owners run this command regularly, or set it up on a timer.
    """
    current_time = time.time() * 1000

    # The path to where uploaded files are stored
    storage = Path(app.config["FHOST_STORAGE_PATH"])

    # Query for every file whose expiration time has passed. Rows whose
    # expiration is NULL are skipped: this command sets expiration to NULL
    # once it has handled a file.
    expired_files = File.query.where(
        and_(
            File.expiration.is_not(None),
            File.expiration < current_time,
        )
    )

    files_removed = 0

    # For every expired file...
    for file in expired_files:
        # Log the file we're about to remove
        file_name = file.getname()
        file_hash = file.sha256
        file_path = storage / file_hash
        print(f"Removing expired file {file_name} [{file_hash}]")

        # Remove it from the file system
        try:
            os.remove(file_path)
            files_removed += 1
        except FileNotFoundError:
            pass  # If the file was already gone, we're good
        except OSError as e:
            print(e)
            # The middle literal must be an f-string for {file_path} to be
            # interpolated, and adjacent literals need explicit separators.
            print(
                "\n------------------------------------\n"
                f"Encountered an error while trying to remove file {file_path}. Double "
                "check to make sure the server is configured correctly, permissions are "
                "okay, and everything is ship shape, then try again.")
            return

        # Finally, mark that the file was removed
        file.expiration = None

    db.session.commit()
    print(f"\nDone! {files_removed} file(s) removed")
""" For a file of a given size, determine the largest allowed lifespan of that file
Based on the current app's configuration: Specifically, the MAX_CONTENT_LENGTH, as well
as FHOST_{MIN,MAX}_EXPIRATION.
This lifespan may be shortened by a user's request, but no files should be allowed to
expire at a point after this number.
Value returned is a duration in milliseconds.
"""
def get_max_lifespan(filesize: int) -> int:
    """
    Return the longest lifespan, in milliseconds, allowed for a file of `filesize` bytes.

    The result is interpolated between FHOST_MAX_EXPIRATION (for a zero-byte file) and
    FHOST_MIN_EXPIRATION (for a file of MAX_CONTENT_LENGTH bytes) along a cubic curve.
    Missing settings fall back to 365 days, 30 days and 256 MiB respectively.
    """
    min_exp = app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)
    # Flask pre-populates MAX_CONTENT_LENGTH with None, so a .get() default
    # would never apply when the setting is left unset; `or` also guards
    # against a zero value that would otherwise divide by zero.
    max_size = app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024
    return min_exp + int((-max_exp + min_exp) * (filesize / max_size - 1) ** 3)

View File

@ -45,6 +45,19 @@ MAX_CONTENT_LENGTH = 256 * 1024 * 1024 # Default: 256MiB
MAX_URL_LENGTH = 4096
# The minimum and maximum amount of time we'll retain a file for
#
# Small files (nearing zero bytes) are stored for the longest possible expiration date,
# while larger files (nearing MAX_CONTENT_LENGTH bytes) are stored for the shortest amount
# of time. Values between these two extremes are interpolated with a cubic curve,
# like the one shown on the index page.
#
# All times are in milliseconds. If you want all files to be stored for the same amount
# of time, set these to the same value.
FHOST_MIN_EXPIRATION = 30 * 24 * 60 * 60 * 1000
FHOST_MAX_EXPIRATION = 365 * 24 * 60 * 60 * 1000
# Use the X-SENDFILE header to speed up serving files w/ compatible webservers
#
# Some webservers can be configured to use the X-Sendfile header to handle sending

View File

@ -0,0 +1,81 @@
"""add file expirations
Revision ID: 939a08e1d6e5
Revises: 7e246705da6a
Create Date: 2022-11-22 12:16:32.517184
"""
# revision identifiers, used by Alembic.
revision = '939a08e1d6e5'
down_revision = '7e246705da6a'
from alembic import op
from flask import current_app
from flask_sqlalchemy import SQLAlchemy
from pathlib import Path
import sqlalchemy as sa
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import os
import time
""" For a file of a given size, determine the largest allowed lifespan of that file
Based on the current app's configuration: Specifically, the MAX_CONTENT_LENGTH, as well
as FHOST_{MIN,MAX}_EXPIRATION.
This lifespan may be shortened by a user's request, but no files should be allowed to
expire at a point after this number.
Value returned is a duration in milliseconds.
"""
def get_max_lifespan(filesize: int) -> int:
    """
    Return the longest lifespan, in milliseconds, allowed for a file of `filesize` bytes.

    The result is interpolated between FHOST_MAX_EXPIRATION (for a zero-byte file) and
    FHOST_MIN_EXPIRATION (for a file of MAX_CONTENT_LENGTH bytes) along a cubic curve.
    Missing settings fall back to 365 days, 30 days and 256 MiB respectively.
    """
    min_exp = current_app.config.get("FHOST_MIN_EXPIRATION", 30 * 24 * 60 * 60 * 1000)
    max_exp = current_app.config.get("FHOST_MAX_EXPIRATION", 365 * 24 * 60 * 60 * 1000)
    # Flask pre-populates MAX_CONTENT_LENGTH with None, so a .get() default
    # would never apply when the setting is left unset; `or` also guards
    # against a zero value that would otherwise divide by zero.
    max_size = current_app.config.get("MAX_CONTENT_LENGTH") or 256 * 1024 * 1024
    return min_exp + int((-max_exp + min_exp) * (filesize / max_size - 1) ** 3)
# Automap base; its mapped classes are reflected from the live database schema
# when upgrade() calls Base.prepare().
Base = automap_base()
def upgrade():
    """
    Add the `expiration` column to the `file` table and backfill it.

    Every non-removed file whose hash is still present in FHOST_STORAGE_PATH is given an
    expiration of its on-disk modification time plus the maximum lifespan for its size.
    Files not present on disk keep a NULL expiration.
    """
    op.add_column('file', sa.Column('expiration', sa.BigInteger()))

    bind = op.get_bind()
    Base.prepare(autoload_with=bind)
    File = Base.classes.file
    session = Session(bind=bind)

    storage = Path(current_app.config["FHOST_STORAGE_PATH"])

    # List of file hashes which have not expired yet
    # This could get really big for some servers
    try:
        unexpired_files = os.listdir(storage)
    except FileNotFoundError:
        return  # There are no currently unexpired files

    # Select the database rows for files that are still on disk
    files = session.scalars(
        sa.select(File)
        .where(
            sa.not_(File.removed),
            File.sha256.in_(unexpired_files)
        )
    )

    updates = []  # We coalesce updates to the database here
    for file in files:
        stat = os.stat(storage / file.sha256)
        max_age = get_max_lifespan(stat.st_size)  # How long the file is allowed to live, in ms
        file_birth = stat.st_mtime * 1000  # When the file was last modified, in ms
        updates.append({'id': file.id, 'expiration': int(file_birth + max_age)})

    # Apply coalesced updates
    session.bulk_update_mappings(File, updates)
    session.commit()
def downgrade():
    """Revert this revision by dropping the `expiration` column from the `file` table."""
    table, column = 'file', 'expiration'
    op.drop_column(table, column)

View File

@ -11,6 +11,15 @@ Or you can shorten URLs:
File URLs are valid for at least 30 days and up to a year (see below).
Shortened URLs do not expire.
Files can be set to expire sooner by adding an "expires" parameter (in hours)
curl -F'file=@yourfile.png' -F'expires=24' {{ fhost_url }}
OR by setting "expires" to a timestamp in epoch milliseconds
curl -F'file=@yourfile.png' -F'expires=1681996320000' {{ fhost_url }}
Expired files won't be removed immediately, but will be removed as part of
the next purge.
{% set max_size = config["MAX_CONTENT_LENGTH"]|filesizeformat(True) %}
Maximum file size: {{ max_size }}
Not allowed: {{ config["FHOST_MIME_BLACKLIST"]|join(", ") }}
@ -22,7 +31,7 @@ FILE RETENTION PERIOD
retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
days
365 | \\
{{'{: 6}'.format(config.get("FHOST_MAX_EXPIRATION", 31536000000)//86400000)}} | \\
| \\
| \\
| \\
@ -30,7 +39,7 @@ retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
| \\
| ..
| \\
197.5 | ----------..-------------------------------------------
{{'{: 6.1f}'.format((config.get("FHOST_MIN_EXPIRATION", 2592000000)/2 + config.get("FHOST_MAX_EXPIRATION", 31536000000)/2)/86400000)}} | ----------..-------------------------------------------
| ..
| \\
| ..
@ -39,7 +48,7 @@ retention = min_age + (-max_age + min_age) * pow((file_size / max_size - 1), 3)
| ...
| ....
| ......
30 | ....................
{{'{: 6}'.format(config.get("FHOST_MIN_EXPIRATION", 2592000000)//86400000)}} | ....................
0{{ ((config["MAX_CONTENT_LENGTH"]/2)|filesizeformat(True)).split(" ")[0].rjust(27) }}{{ max_size.split(" ")[0].rjust(27) }}
{{ max_size.split(" ")[1].rjust(54) }}
</pre>