Source code for mkv_episode_matcher.mkv_to_srt
import os
import subprocess
import sys
# Get the absolute path of the parent directory of the current script.
parent_dir = os.path.dirname(os.path.abspath(__file__))
# Add the parent directory to the Python path.
sys.path.append(parent_dir)
# Add the 'libraries' directory to the Python path.
sys.path.append(os.path.join(parent_dir, "libraries"))
# Add the bundled 'pgs2srt' directory (../libraries/pgs2srt) to the Python path.
sys.path.append(os.path.join(parent_dir, "..", "libraries", "pgs2srt"))
import pytesseract
import re
from PIL import Image, ImageOps
from mkv_episode_matcher.__main__ import CONFIG_FILE
from mkv_episode_matcher.config import get_config
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
from pgsreader import PGSReader
from imagemaker import make_image
from loguru import logger
def convert_mkv_to_sup(mkv_file, output_dir):
    """
    Extract a subtitle stream from an .mkv file into a .sup file using FFmpeg.

    The output file is named after the .mkv file (extension replaced by
    ``.sup``) and placed in ``output_dir``. If that file already exists the
    conversion is skipped.

    Args:
        mkv_file (str): Path to the .mkv file.
        output_dir (str): Path to the directory where the .sup file will be saved.

    Returns:
        str: Path of the .sup file. The path is returned even when FFmpeg
        fails; in that case the failure is logged and the file will not exist.
    """
    # Derive "<output_dir>/<mkv base name>.sup".
    base_name = os.path.splitext(os.path.basename(mkv_file))[0]
    sup_file = os.path.join(output_dir, f"{base_name}.sup")

    if os.path.exists(sup_file):
        logger.info(f"File {sup_file} already exists, skipping")
        return sup_file

    logger.info(f"Processing {mkv_file} to {sup_file}")
    # "-map 0:s:0" selects the first subtitle stream of the first input and
    # "-c copy" copies it without re-encoding.
    ffmpeg_cmd = ["ffmpeg", "-i", mkv_file, "-map", "0:s:0", "-c", "copy", sup_file]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error converting {mkv_file}: {e}")
        # A failed run can leave a truncated/empty output behind. Remove it,
        # otherwise the "already exists" check above would skip the
        # conversion on every later run and the broken file would be OCR'd.
        if os.path.exists(sup_file):
            try:
                os.remove(sup_file)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove incomplete file {sup_file}: {cleanup_error}"
                )
    else:
        logger.info(f"Converted {mkv_file} to {sup_file}")
    return sup_file
def _format_srt_timestamp(milliseconds):
    """Format a millisecond offset as a SubRip ``HH:MM:SS,mmm`` timestamp."""
    # Work on the offset directly instead of going through a local-time
    # ``datetime``: that conversion depends on the machine's time zone and
    # shifts every subtitle by the zone's UTC offset.
    total_ms = timedelta(milliseconds=milliseconds) // timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


@logger.catch
def perform_ocr(sup_file_path):
    """
    Perform OCR on a .sup file and save the extracted text to a .srt file.

    The .srt file is written next to the .sup file, using the same base name.
    If it already exists, nothing is done. Each display set that carries an
    image is OCR'd with Tesseract; the following display set without an image
    supplies the end time of that subtitle.

    Args:
        sup_file_path (str): Path to the .sup file.

    Returns:
        None
    """
    base_name = os.path.splitext(os.path.basename(sup_file_path))[0]
    output_dir = os.path.dirname(sup_file_path)
    logger.info(f"Performing OCR on {sup_file_path}")
    # Construct the output .srt file path
    srt_file = os.path.join(output_dir, f"{base_name}.srt")

    # Nothing to do if the subtitles were already extracted; return before
    # loading the SUP file or touching the Tesseract configuration.
    if os.path.exists(srt_file):
        logger.info(f"File {srt_file} already exists, skipping")
        return

    # Load a PGS/SUP file.
    pgs = PGSReader(sup_file_path)

    tesseract_lang = "eng"
    tesseract_config = "-c tessedit_char_blacklist=[] --psm 6 --oem 1"

    config = get_config(CONFIG_FILE)
    tesseract_path = config.get("tesseract_path")
    # Only override the command when a path is configured; str(None) would
    # otherwise become the literal command "None".
    if tesseract_path:
        logger.debug(f"Setting Teesseract Path to {tesseract_path}")
        pytesseract.pytesseract.tesseract_cmd = str(tesseract_path)

    # Finished SubRip entries, joined once at the end.
    entries = []
    # State of the subtitle currently being assembled. Initialised up front so
    # a leading display set without an image cannot hit an unbound name.
    start_ms = None
    text = None

    # Iterate the pgs generator
    for ds in pgs.iter_displaysets():
        if ds.has_image:
            # Get Palette Display Segment
            pds = ds.pds[0]
            # Get Object Display Segment
            ods = ds.ods[0]

            if pds and ods:
                # Create the bitmap image and convert it to RGBA
                src = make_image(ods, pds).convert("RGBA")
                # Create grayscale image with black background
                img = Image.new("L", src.size, "BLACK")
                # Paste the subtitle bitmap
                img.paste(src, (0, 0), src)
                # Invert images so the text is readable by Tesseract
                img = ImageOps.invert(img)

                # Parse the image with Tesseract
                text = pytesseract.image_to_string(
                    img, lang=tesseract_lang, config=tesseract_config
                ).strip()

                # Replace "|" with "I"
                # Works better than blacklisting "|" in Tesseract,
                # which results in I becoming "!" "i" and "1"
                text = re.sub(r"[|/\\]", "I", text)
                text = re.sub(r"[_]", "L", text)

                start_ms = ods.presentation_timestamp
        else:
            # Get Presentation Composition Segment
            pcs = ds.pcs[0]

            # Emit an entry only when a start time and non-empty text are
            # pending from a preceding image display set.
            if pcs and start_ms is not None and text:
                end_ms = pcs.presentation_timestamp
                entries.append(
                    f"{len(entries) + 1}\n"
                    f"{_format_srt_timestamp(start_ms)} --> "
                    f"{_format_srt_timestamp(end_ms)}\n"
                    f"{text}\n\n"
                )
                start_ms = text = None

    # OCR output may contain non-ASCII characters; do not depend on the
    # platform default encoding.
    with open(srt_file, "w", encoding="utf-8") as f:
        f.write("".join(entries))
    logger.info(f"Saved to: {srt_file}")
def convert_mkv_to_srt(season_path, mkv_files):
    """
    Extract subtitles from MKV files and OCR them into SRT files.

    Every file is first converted to a .sup file inside
    ``<season_path>/ocr`` (created if missing); OCR is then run on all
    resulting .sup files through a thread pool.

    Args:
        season_path (str): The path to the season directory.
        mkv_files (list): List of MKV files to convert.

    Returns:
        None
    """
    logger.info(f"Converting {len(mkv_files)} files to SRT")

    ocr_dir = os.path.join(season_path, "ocr")
    os.makedirs(ocr_dir, exist_ok=True)

    # Extraction happens sequentially, before any OCR work is scheduled.
    sup_paths = [convert_mkv_to_sup(path, ocr_dir) for path in mkv_files]

    # map() schedules every task immediately; leaving the ``with`` block
    # waits for all of them to finish. Results are intentionally not read.
    with ThreadPoolExecutor() as pool:
        pool.map(perform_ocr, sup_paths)