From 50307575b2f6dacd59a6b237a81140fcd34ed270 Mon Sep 17 00:00:00 2001 From: caffeinelucy Date: Fri, 26 Jul 2024 12:04:59 +0200 Subject: [PATCH] add code --- video2anki.py | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 video2anki.py diff --git a/video2anki.py b/video2anki.py new file mode 100644 index 0000000..7a1ea43 --- /dev/null +++ b/video2anki.py @@ -0,0 +1,205 @@ +# GNU AFFERO GENERAL PUBLIC LICENSE v3.0 (see LICENSE or https://www.gnu.org/licenses/agpl-3.0.txt) + +import pysrt, sys, getopt, numbers, genanki, random, tempfile, shutil, traceback, datetime, subprocess, multiprocessing, html +from pathlib import Path +from tqdm import tqdm +from subprocess import Popen, PIPE + +TrackA = 0 +TrackB = 0 +Media = "" +CJK = "none" +Encoding = "iso-8859-1" +Title = "Unknown Title" + +TMP = tempfile.mkdtemp()+"/" +#TMP = "TMP/" + +AnkiModel = genanki.Model(random.randrange(1 << 30, 1 << 31), + 'Video2Anki With Media', + fields = [{'name' : 'Question'}, + {'name' : 'Answer'}, + {'name' : 'Voiceline'}], + templates = [{'name' : 'Card', + 'qfmt' : '{{Question}}
<{{Voiceline}}', + 'afmt' : '{{FrontSide}}
{{Answer}}'}]) + +def parse_args(): + opt = "m:a:b:c:e:t:" + lopt = ["media-file=", "track-a=", "track-b=", "cjk-mode=", "encoding=", "title="] + args, vals = getopt.getopt(sys.argv[1:], opt, lopt) + try: + for arg, val in args: + print("<"+arg+"> -> "+val) + if arg in ("-m", "--media-file"): + global Media + Media = val + print("media set to ", Media) + elif arg in ("-a", "--track-a"): + global TrackA + TrackA = val + elif arg in ("-b", "--track-b"): + global TrackB + TrackB = val + elif arg in ("-e", "--encoding"): + global Encoding + Encoding = val + elif arg in ("-t", "--title"): + global Title + Title = val + elif arg in ("-c", "--cjk-mode"): + if val in ("c", "j", "k"): + global CJK + CJK = val + else: + raise Exception('invalid cjk value: ', val) + except getopt.error as err: + print(str(err)) + +def to_seconds(t): + s = t.hour*60*60 + t.minute*60 + t.second + m = int(t.microsecond/1000) + return str(s)+'.'+str(m)[0] + +def to_duration(start, end): + t = start + sa = t.hour*60*60 + t.minute*60 + t.second + ma = int(t.microsecond/1000) + t = end + se = t.hour*60*60 + t.minute*60 + t.second + me = int(t.microsecond/1000) + s = se-sa + m = me-ma + if m < 0: + s = s - 1 + m = 1000-m + return str(s)+'.'+str(m)[0] + +def extract_audio(name, start, end): + cmds = ["ffmpeg", "-y", "-threads", str(multiprocessing.cpu_count()), "-i", Media, "-ss", to_seconds(start), "-t", to_duration(start, end), name] + cmd = "" + for c in cmds: + cmd = cmd + str(c) + " " + subprocess.check_call(cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + shell=True) + +def extract_track(idx): + cmds = ["ffmpeg", "-i", Media, "-map=0:s:"+str(idx), TMP+Media+"_"+str(idx)+".srt"] + cmd = "" + for c in cmds: + cmd = cmd + str(c) + " " + subprocess.check_call(cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + shell=True) + return TMP+Media+"_"+idx+".srt" + +def get_track(track): + if isinstance(track, numbers.Number): + track = extract_srt(track) + return pysrt.open(track) + +def overlaps(start_a, end_a, start_b, end_b): + start_c = start_b if start_a < start_b else start_a + end_c = end_b if end_a > end_b else end_b + return start_c < end_c + +def apply_cjk(original): + text = "" + lookup = "" + if CJK=="c": + lookup = "https://strokeorder.com/chinese/" + elif CJK=="j": + lookup = "https://japandict.com/kanji/" + elif CJK=="k": + lookup = "https://koreanhanja.app/" + else: + return html.escape(original) + for p in original: + escaped_p = html.escape(p) + if is_cjk(p) and lookup!="": + text = text + "" + escaped_p + "" + else: + text = text + escaped_p + return text + +def is_cjk(p): + cjk_ranges = [ + ( 0x4E00, 0x62FF), + ( 0x6300, 0x77FF), + ( 0x7800, 0x8CFF), + ( 0x8D00, 0x9FCC), + ( 0x3400, 0x4DB5), + (0x20000, 0x215FF), + (0x21600, 0x230FF), + (0x23100, 0x245FF), + (0x24600, 0x260FF), + (0x26100, 0x275FF), + (0x27600, 0x290FF), + (0x29100, 0x2A6DF), + (0x2A700, 0x2B734), + (0x2B740, 0x2B81D), + (0x2B820, 0x2CEAF), + (0x2CEB0, 0x2EBEF), + (0x2F800, 0x2FA1F) + ] + cp = ord(p) + for bottom, top in cjk_ranges: + if cp >= bottom and cp <= top: + return True + return False + +def match_tracks(a, b): + matched = [] + media = [] + matchbar = tqdm(total=len(a) if len(a) > len(b) else len(b)) + for sub_a in a: + start_a = sub_a.start.to_time() + end_a = sub_a.end.to_time() + for sub_b in b: + start_b = sub_b.start.to_time() + end_b = sub_b.end.to_time() + if start_b > end_a: + # skip a, b is past a + break + if end_b < start_a: + # skip b, a is past b + matchbar.update(1) + continue + elif overlaps(start_a, end_a, start_b, end_b): + # match + audio_name = str(len(matched)) + ".opus" + media.append(TMP + audio_name) + extract_audio(TMP + audio_name, start_a, end_a) + text_b = apply_cjk(html.escape(sub_b.text)) + matched.append(genanki.Note(model=AnkiModel, fields=[text_b, html.escape(sub_a.text), "[sound:"+audio_name+"]"])) + matchbar.update(1) + matchbar.close() + return matched, media + +try: + parse_args() + if Path(Media).is_file()==False: + raise Exception('file <', Media, '> does not exist') + global AnkiDeck + AnkiDeck = genanki.Deck(random.randrange(1 << 30, 1 << 31),Title) + subs_a = get_track(TrackA) + subs_b = get_track(TrackB) + print("matching subtitles of both tracks...") + notes, media = match_tracks(subs_a, subs_b) + for note in notes: + AnkiDeck.add_note(note) + print("generating anki deck") + package = genanki.Package(AnkiDeck) + package.media_files = media + print("writing apkg file...") + package.write_to_file(Title+".apkg") + print("done") +except Exception as e: + print(traceback.format_exc()) +except subprocess.CalledProcessError as e: + print(e.output) + +shutil.rmtree(TMP)