add code

2024-07-26 12:04:59 +02:00 · 2024-07-26 12:04:59 +02:00 · 50307575b2
commit 50307575b2
parent 6a9e7d8764
1 changed files with 205 additions and 0 deletions
--- a/video2anki.py
+++ b/video2anki.py
@@ -0,0 +1,205 @@
+# GNU AFFERO GENERAL PUBLIC LICENSE v3.0 (see LICENSE or https://www.gnu.org/licenses/agpl-3.0.txt)
+
+import pysrt, sys, getopt, numbers, genanki, random, tempfile, shutil, traceback, datetime, subprocess, multiprocessing, html
+from pathlib import Path
+from tqdm import tqdm
+from subprocess import Popen, PIPE
+
+# Subtitle track selectors, overwritten by parse_args. get_track treats a
+# number as a subtitle stream to extract from the media file and anything
+# else as a path that is opened directly as a subtitle file.
+TrackA = 0
+TrackB = 0
+# Path of the media file that ffmpeg reads audio clips and subtitles from.
+Media = ""
+# CJK lookup-link mode used by apply_cjk: "c", "j", "k", or "none" (disabled).
+CJK = "none"
+# Value of the -e/--encoding option.
+# NOTE(review): no visible code reads this after parse_args stores it.
+Encoding = "iso-8859-1"
+# Deck name; also the base name of the .apkg file written at the end.
+Title = "Unknown Title"
+
+# Scratch directory (with trailing slash) for extracted subtitles and audio
+# clips; removed again by the script once it has finished.
+TMP = tempfile.mkdtemp()+"/"
+# Debug alternative: keep the intermediate files in a local directory.
+#TMP = "TMP/"
+
+# Note type shared by every generated note. The front shows the question text
+# followed by the voice line's [sound:...] tag; the back repeats the front and
+# appends the answer below a divider.
+# NOTE(review): the model id is drawn at random on every run, so every export
+# uses a different note type id -- consider hard-coding one fixed id.
+AnkiModel = genanki.Model(
+    random.randrange(1 << 30, 1 << 31),
+    'Video2Anki With Media',
+    fields=[
+        {'name': 'Question'},
+        {'name': 'Answer'},
+        {'name': 'Voiceline'},
+    ],
+    templates=[
+        {
+            'name': 'Card',
+            # No literal "<" in front of {{Voiceline}}: it would end up as
+            # stray markup on the front of every card.
+            'qfmt': '{{Question}}<br>{{Voiceline}}',
+            'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
+        },
+    ],
+)
+
+def _track_selector(val):
+    """Return *val* as an int if it is a plain stream index, else unchanged."""
+    return int(val) if val.isdigit() else val
+
+
+def parse_args():
+    """Read the command-line options into the module-level settings.
+
+    Options (each takes a value):
+        -m/--media-file  media file to cut voice lines from   -> Media
+        -a/--track-a     subtitle stream index or .srt path   -> TrackA
+        -b/--track-b     subtitle stream index or .srt path   -> TrackB
+        -e/--encoding    subtitle encoding                    -> Encoding
+        -t/--title       deck title                           -> Title
+        -c/--cjk-mode    one of "c", "j", "k"                 -> CJK
+
+    A malformed command line is reported on stdout and leaves the settings
+    at their current values.
+
+    Raises:
+        ValueError: if the --cjk-mode value is not "c", "j" or "k".
+    """
+    global Media, TrackA, TrackB, Encoding, Title, CJK
+    opt = "m:a:b:c:e:t:"
+    lopt = ["media-file=", "track-a=", "track-b=", "cjk-mode=", "encoding=", "title="]
+    # getopt raises while parsing, so the call itself has to sit inside the try.
+    try:
+        args, _ = getopt.getopt(sys.argv[1:], opt, lopt)
+    except getopt.GetoptError as err:
+        print(str(err))
+        return
+    for arg, val in args:
+        print("<"+arg+"> -> "+val)
+        if arg in ("-m", "--media-file"):
+            Media = val
+            print("media set to ", Media)
+        elif arg in ("-a", "--track-a"):
+            # All-digit values become ints so get_track sees them as numbers
+            # (stream indices) rather than as file names.
+            TrackA = _track_selector(val)
+        elif arg in ("-b", "--track-b"):
+            TrackB = _track_selector(val)
+        elif arg in ("-e", "--encoding"):
+            Encoding = val
+        elif arg in ("-t", "--title"):
+            Title = val
+        elif arg in ("-c", "--cjk-mode"):
+            if val not in ("c", "j", "k"):
+                raise ValueError("invalid cjk value: " + val)
+            CJK = val
+
+def to_seconds(t):
+    """Return the time of day *t* as a "seconds.milliseconds" string.
+
+    Args:
+        t: object with hour, minute, second and microsecond attributes
+            (e.g. datetime.time).
+
+    Returns:
+        Seconds since 00:00:00 with exactly three fractional digits,
+        e.g. "62.050".
+    """
+    whole = t.hour * 3600 + t.minute * 60 + t.second
+    millis = t.microsecond // 1000
+    # Zero-pad the fraction: 50 ms must read ".050", not ".5".
+    return f"{whole}.{millis:03d}"
+
+def to_duration(start, end):
+    """Return the length of the span from *start* to *end* as "S.mmm".
+
+    Args:
+        start: time of day with hour, minute, second and microsecond
+            attributes (e.g. datetime.time).
+        end: time of day of the same kind.
+
+    Returns:
+        The difference in seconds with exactly three fractional digits,
+        e.g. "1.200". An *end* that lies before *start* yields "0.000".
+    """
+    def as_millis(t):
+        return ((t.hour * 60 + t.minute) * 60 + t.second) * 1000 + t.microsecond // 1000
+
+    # Subtracting whole milliseconds avoids a manual borrow between the
+    # seconds and the fraction.
+    span = max(as_millis(end) - as_millis(start), 0)
+    seconds, millis = divmod(span, 1000)
+    return f"{seconds}.{millis:03d}"
+
+def extract_audio(name, start, end):
+    """Cut the audio between *start* and *end* out of Media into file *name*.
+
+    Args:
+        name: output path; an existing file is overwritten (-y).
+        start: start of the clip as a time of day.
+        end: end of the clip as a time of day.
+
+    Raises:
+        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
+    """
+    cmd = [
+        "ffmpeg", "-y",
+        "-threads", str(multiprocessing.cpu_count()),
+        "-i", Media,
+        "-ss", to_seconds(start),
+        "-t", to_duration(start, end),
+        name,
+    ]
+    # An argument list without a shell keeps paths that contain spaces or
+    # shell metacharacters intact.
+    subprocess.check_call(cmd,
+                          stdout=subprocess.DEVNULL,
+                          stderr=subprocess.STDOUT)
+
+def extract_track(idx):
+    """Extract subtitle stream *idx* of Media into an .srt file under TMP.
+
+    Args:
+        idx: index of the subtitle stream (int or numeric string).
+
+    Returns:
+        Path of the written .srt file.
+
+    Raises:
+        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
+    """
+    # Only the file name of Media goes into the target name; any directory
+    # part of Media would not exist below TMP.
+    target = TMP + Path(Media).name + "_" + str(idx) + ".srt"
+    # -map takes the stream specifier as a separate argument. -y overwrites
+    # without prompting when the same stream is requested a second time.
+    cmd = ["ffmpeg", "-y", "-i", Media, "-map", "0:s:" + str(idx), target]
+    subprocess.check_call(cmd,
+                          stdout=subprocess.DEVNULL,
+                          stderr=subprocess.STDOUT)
+    return target
+
+def get_track(track):
+    """Load a subtitle track.
+
+    Args:
+        track: a number or an all-digit string selects a subtitle stream of
+            Media, which is extracted first; anything else is used as the
+            path of a subtitle file.
+
+    Returns:
+        Whatever pysrt.open returns for the resulting file.
+    """
+    is_index = isinstance(track, numbers.Number) or (
+        isinstance(track, str) and track.isdigit())
+    if is_index:
+        track = extract_track(track)
+    # NOTE(review): the module-level Encoding setting is not passed on here.
+    return pysrt.open(track)
+
+def overlaps(start_a, end_a, start_b, end_b):
+    """Return True if interval a and interval b share more than one point.
+
+    The intersection runs from the later of the two starts to the earlier of
+    the two ends; it is non-empty exactly when that start lies before that
+    end. Intervals that merely touch do not count as overlapping.
+    """
+    return max(start_a, start_b) < min(end_a, end_b)
+
+def apply_cjk(original):
+    """HTML-escape *original* and link each CJK ideograph to a lookup site.
+
+    The site depends on the module-level CJK mode ("c", "j" or "k"). With any
+    other mode the text is only escaped and no links are added.
+    """
+    lookup_by_mode = {
+        "c": "https://strokeorder.com/chinese/",
+        "j": "https://japandict.com/kanji/",
+        "k": "https://koreanhanja.app/",
+    }
+    if CJK not in lookup_by_mode:
+        return html.escape(original)
+    base_url = lookup_by_mode[CJK]
+    pieces = []
+    for char in original:
+        safe = html.escape(char)
+        if is_cjk(char):
+            # The link target is the lookup URL followed by the character.
+            pieces.append("<a href=\"" + base_url + safe + "\">" + safe + "</a>")
+        else:
+            pieces.append(safe)
+    return "".join(pieces)
+
+def is_cjk(p):
+    """Return True if the single character *p* is in one of the listed ranges.
+
+    Each pair below is an inclusive (first, last) code point range.
+    """
+    code_point = ord(p)
+    blocks = (
+        (0x4E00, 0x62FF),
+        (0x6300, 0x77FF),
+        (0x7800, 0x8CFF),
+        (0x8D00, 0x9FCC),
+        (0x3400, 0x4DB5),
+        (0x20000, 0x215FF),
+        (0x21600, 0x230FF),
+        (0x23100, 0x245FF),
+        (0x24600, 0x260FF),
+        (0x26100, 0x275FF),
+        (0x27600, 0x290FF),
+        (0x29100, 0x2A6DF),
+        (0x2A700, 0x2B734),
+        (0x2B740, 0x2B81D),
+        (0x2B820, 0x2CEAF),
+        (0x2CEB0, 0x2EBEF),
+        (0x2F800, 0x2FA1F),
+    )
+    return any(first <= code_point <= last for first, last in blocks)
+
+def match_tracks(a, b):
+    """Pair overlapping subtitles of tracks *a* and *b* into Anki notes.
+
+    For every pair that overlaps in time, the audio spanning the a-subtitle
+    is cut into "<n>.opus" under TMP and a note is created with the b-text as
+    question, the a-text as answer and the clip as voice line.
+
+    Args:
+        a: subtitles whose timing defines the audio clips; text is the answer.
+        b: subtitles whose text is the question. The early exit from the
+            inner loop relies on b being ordered by start time.
+
+    Returns:
+        (notes, media): the genanki notes and the paths of their audio clips.
+    """
+    matched = []
+    media = []
+    # One tick per subtitle of a, so the bar ends exactly at its total.
+    with tqdm(total=len(a)) as matchbar:
+        for sub_a in a:
+            start_a = sub_a.start.to_time()
+            end_a = sub_a.end.to_time()
+            for sub_b in b:
+                start_b = sub_b.start.to_time()
+                end_b = sub_b.end.to_time()
+                if start_b > end_a:
+                    # b is already past a; go on with the next a
+                    break
+                if end_b < start_a:
+                    # this b ends before a starts; try the next b
+                    continue
+                if overlaps(start_a, end_a, start_b, end_b):
+                    audio_name = str(len(matched)) + ".opus"
+                    media.append(TMP + audio_name)
+                    extract_audio(TMP + audio_name, start_a, end_a)
+                    # apply_cjk escapes the text itself; escaping it here as
+                    # well would show "&amp;" etc. literally on the card.
+                    text_b = apply_cjk(sub_b.text)
+                    matched.append(genanki.Note(
+                        model=AnkiModel,
+                        fields=[text_b, html.escape(sub_a.text), "[sound:"+audio_name+"]"]))
+            matchbar.update(1)
+    return matched, media
+
+# Script body: parse options, match both subtitle tracks, write "<Title>.apkg".
+# Errors are reported on stdout; the scratch directory is removed either way.
+try:
+    parse_args()
+    if not Path(Media).is_file():
+        raise FileNotFoundError('file <' + Media + '> does not exist')
+    AnkiDeck = genanki.Deck(random.randrange(1 << 30, 1 << 31), Title)
+    subs_a = get_track(TrackA)
+    subs_b = get_track(TrackB)
+    print("matching subtitles of both tracks...")
+    notes, media = match_tracks(subs_a, subs_b)
+    for note in notes:
+        AnkiDeck.add_note(note)
+    print("generating anki deck")
+    package = genanki.Package(AnkiDeck)
+    package.media_files = media
+    print("writing apkg file...")
+    package.write_to_file(Title+".apkg")
+    print("done")
+except subprocess.CalledProcessError as e:
+    # Must come before the generic handler, which would otherwise catch it.
+    # ffmpeg's output is sent to DEVNULL, so e.output is empty; the exception
+    # text names the command and its exit status.
+    print(e)
+except Exception:
+    print(traceback.format_exc())
+finally:
+    # Runs on every exit path, including Ctrl-C.
+    shutil.rmtree(TMP, ignore_errors=True)