我尝试过直接利用 API 导出转换,但很遗憾,API 每小时的调用次数限制对于迁移这 200 多篇文档来说根本不够用,直接坠机。后来看到可以先导出 .lakebook,再转成 Markdown,yuque2markdown 这个工具就是干这个的:
# Fetch the converter and install its Python dependencies.
git clone https://github.com/alswl/yuque2markdown.git
cd yuque2markdown && \
pip3 install -r requirements.txt
把所有 .lakebook 下载到 download 目录下之后,直接批量处理:
# For every .lakebook in the parent directory: create a vault folder named
# after the book, convert it, and on failure print a warning but keep going.
find .. -maxdepth 1 -type f -name "*.lakebook" -print0 | while IFS= read -r -d $'\0' f; do name="$(basename "$f" .lakebook)"; mkdir -p "$HOME/My_Vault/$name"; python3 ./yuque2markdown.py "$f" "$HOME/My_Vault/$name" --download-image || echo "[WARN] 失败: $f"; done
原脚本有一些小 bug 需要修复:比如图片无法下载、有些边界情况作者没考虑到;另外我想把所有图片统一转成 PNG 来管理,所以还加了一个强制转换。最终的脚本如下:
# coding=utf-8
import argparse
import json
import os
import random
import re
import shutil
import sys
import tarfile
import tempfile
from base64 import b64decode
from binascii import Error as BinasciiError
from io import BytesIO
from urllib.parse import unquote, unquote_to_bytes, urlparse

import yaml
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from PIL import Image, UnidentifiedImageError
from requests import get
from requests.exceptions import RequestException
# TOC entry types found in the lakebook's tocYml.
TYPE_TITLE = "TITLE"
TYPE_DOC = "DOC"
# Name of the metadata file inside the extracted archive.
META_JSON = "$meta.json"
# System temp dir used as scratch space while extracting the archive.
TMP_DIR = tempfile.gettempdir()
# markdownify heading style: "#" headings rather than underlined ones.
DEFAULT_HEADING_STYLE = "ATX"
# Maps an HTTP Content-Type to the file extension to save under.
CONTENT_TYPE_TO_EXTENSION = {
    "image/gif": ".gif",
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/svg+xml": ".svg",
    "image/png": ".png",
    "image/webp": ".webp",
}
# Raster formats that get force-converted to PNG for uniform management.
CONVERT_TO_PNG_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
# Built once at import time: every character that is illegal or awkward in
# file names on Windows/macOS/Linux maps to an underscore.
_FILENAME_SANITIZE_TABLE = str.maketrans({c: "_" for c in '/\\ ?*<>|":'})


def sanitizer_file_name(name):
    """Return *name* with filesystem-unsafe characters replaced by "_".

    Replaces /, \\, space, ?, *, <, >, | , " and : in a single
    str.translate pass instead of ten chained str.replace calls.
    """
    return name.translate(_FILENAME_SANITIZE_TABLE)
def read_toc(random_tmp_dir):
    """Read the repo's table of contents out of $meta.json.

    The meta file is a JSON document whose "meta" field is itself a JSON
    string; inside that, book.tocYml holds the TOC as YAML.

    Returns the parsed TOC (a list of entry dicts, or None when tocYml
    is empty).
    """
    meta_path = os.path.join(random_tmp_dir, META_JSON)
    # "with" guarantees the handle is closed even if JSON parsing raises;
    # the original leaked the file object on that path.
    with open(meta_path, "r", encoding="utf-8") as f:
        meta_file_str = json.load(f)
    meta = json.loads(meta_file_str.get("meta", ""))
    toc_str = meta.get("book", {}).get("tocYml", "")
    # SECURITY NOTE(review): yaml.unsafe_load can instantiate arbitrary
    # Python objects via YAML tags. The archive is the user's own export,
    # but yaml.safe_load would be preferable if the TOC never relies on
    # custom tags — TODO confirm before switching.
    return yaml.unsafe_load(toc_str)
def extract_repos(repo_dir, output, toc, download_image):
    """Walk the TOC and write one Markdown file per DOC entry.

    Directory nesting mirrors the TOC levels; TITLE entries only
    contribute path components. Raw docs are read from
    ``<repo_dir>/<url>.json`` and converted from HTML to Markdown.

    repo_dir       -- extracted lakebook repo directory
    output         -- root output directory
    toc            -- parsed TOC list (see read_toc)
    download_image -- when True, also localize <img> references
    """
    last_level = 0
    last_sanitized_title = ""
    path_prefixed = []
    for item in toc:
        t = item["type"]
        url = str(item.get("url", ""))
        current_level = item.get("level", 0)
        title = str(item.get("title", ""))
        sanitized_title = sanitizer_file_name(str(title))
        if not title:
            continue
        # De-duplicate: keep drawing random suffixes until the name is free.
        # (The original checked only once, so a colliding random suffix
        # could still overwrite an earlier article.)
        # NOTE(review): the existence check is against the output root, not
        # the nested target directory — preserved from the original logic.
        while os.path.exists(os.path.join(output, sanitized_title)):
            sanitized_title = sanitizer_file_name(str(title)) + str(
                random.randint(0, 1000)
            )
        if current_level > last_level:
            path_prefixed = path_prefixed + [last_sanitized_title]
        elif current_level < last_level:
            diff = last_level - current_level
            path_prefixed = path_prefixed[0:-diff]
        if t == TYPE_DOC:
            output_dir_path = os.path.join(output, *path_prefixed)
            os.makedirs(output_dir_path, exist_ok=True)
            article_dir_path = os.path.join(output_dir_path, sanitized_title)
            os.makedirs(article_dir_path, exist_ok=True)
            raw_path = os.path.join(repo_dir, url + ".json")
            # "with" closes the raw doc file promptly (the original never
            # closed it).
            with open(raw_path, "r", encoding="utf-8") as raw_file:
                doc_str = json.load(raw_file)
            html = doc_str["doc"]["body"] or doc_str["doc"]["body_asl"]
            if download_image:
                html = download_images_and_patch_html(
                    repo_dir, article_dir_path, html
                )
            output_path = os.path.join(article_dir_path, sanitized_title + ".md")
            # "with" flushes and closes the output file (the original
            # leaked the handle, risking truncated files on interpreter
            # shutdown).
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(pretty_md(md(html, heading_style=DEFAULT_HEADING_STYLE)))
        last_sanitized_title = sanitized_title
        last_level = current_level
def download_images_and_patch_html(repo_dir, output_dir_path, html):
    """Localize every <img> in *html* and rewrite its src to ./assets/NNN.ext.

    Handles four kinds of src:
      * http(s) and protocol-relative ("//...") URLs — downloaded;
      * data: URIs — decoded in place;
      * other schemes — skipped, tag left untouched;
      * repo-relative paths — copied from *repo_dir* (tag removed if the
        file is missing or escapes the repo directory).

    Raster images (see CONVERT_TO_PNG_EXTENSIONS) are re-encoded as PNG;
    anything else keeps its original format. Returns the (possibly
    patched) HTML string.
    """
    bs = BeautifulSoup(html, "html.parser")
    if len(bs.find_all("img")) > 0:
        attachments_dir_path = os.path.join(output_dir_path, "assets")
        if not os.path.exists(attachments_dir_path):
            os.mkdir(attachments_dir_path)
        # Sequential counter: only incremented after an image is saved, so
        # saved files are numbered 001, 002, ... without gaps.
        no = 1
        for image in bs.find_all("img"):
            src = image.get("src", "")
            if not src:
                continue
            parsed_src = urlparse(src)
            png_file_name = "%03d.png" % no
            attachments_file_path = os.path.join(attachments_dir_path, png_file_name)
            if parsed_src.scheme in ("http", "https") or src.startswith("//"):
                # Protocol-relative URLs get https prepended before fetching.
                url = "https:" + src if src.startswith("//") else src
                print("Download %s" % src)
                try:
                    resp = get(url, timeout=30)
                    resp.raise_for_status()
                except RequestException as e:
                    # Best-effort: a failed download keeps the remote src.
                    print("Skip image %s: %s" % (src, e))
                    continue
                content_type = resp.headers.get("Content-Type", "")
                # Prefer PNG re-encoding; fall back to saving the raw bytes
                # if Pillow cannot decode them.
                if should_convert_to_png(src, content_type) and save_image_as_png(
                    BytesIO(resp.content), attachments_file_path
                ):
                    file_name = png_file_name
                else:
                    file_name = save_original_image(
                        resp.content, attachments_dir_path, no, src, content_type
                    )
            elif parsed_src.scheme == "data":
                data_uri = parse_data_uri(src)
                if not data_uri:
                    # Truncate the (potentially huge) data URI in the log.
                    print("Skip image %s: invalid data URI" % src[:80])
                    continue
                content_type, image_data = data_uri
                if should_convert_to_png(src, content_type) and save_image_as_png(
                    BytesIO(image_data), attachments_file_path
                ):
                    file_name = png_file_name
                else:
                    file_name = save_original_image(
                        image_data, attachments_dir_path, no, src, content_type
                    )
            elif parsed_src.scheme:
                print("Skip image %s: unsupported scheme" % src)
                continue
            else:
                # Repo-relative path: resolve inside repo_dir and refuse
                # anything that escapes it (path traversal) or is missing.
                src_path = unquote(parsed_src.path).lstrip("/")
                local_image_path = os.path.abspath(os.path.join(repo_dir, src_path))
                repo_dir_abs = os.path.abspath(repo_dir)
                if not local_image_path.startswith(repo_dir_abs + os.sep) or not os.path.isfile(
                    local_image_path
                ):
                    print("Remove image %s: local file not found" % src)
                    image.decompose()
                    continue
                if should_convert_to_png(src) and save_image_as_png(
                    local_image_path, attachments_file_path
                ):
                    file_name = png_file_name
                else:
                    file_name = save_original_file(
                        local_image_path, attachments_dir_path, no, src
                    )
            no = no + 1
            image["src"] = "./assets/" + file_name
        html = str(bs)
        return html
    else:
        return html
def should_convert_to_png(src, content_type=""):
    """Decide whether an image should be re-encoded as PNG.

    Common raster formats (jpg/jpeg/png/webp) and images with no
    detectable extension are converted; everything else (gif, svg, ...)
    keeps its native format.
    """
    detected = image_extension(src, content_type, "")
    if not detected:
        return True
    return detected in CONVERT_TO_PNG_EXTENSIONS
def image_extension(src, content_type="", default=".img"):
    """Best-effort file extension for an image.

    Prefers the HTTP Content-Type (parameters such as ``;charset=``
    stripped, case-insensitive); falls back to the extension in the URL
    path, then to *default*.
    """
    mime = content_type.split(";", 1)[0].strip().lower()
    try:
        return CONTENT_TYPE_TO_EXTENSION[mime]
    except KeyError:
        pass
    path_ext = os.path.splitext(unquote(urlparse(src).path))[1].lower()
    return path_ext if path_ext else default
def save_original_image(image_data, attachments_dir_path, no, src, content_type=""):
    """Write raw image bytes to the assets dir, keeping the native format.

    Returns the generated file name (e.g. ``"003.jpg"``) relative to
    *attachments_dir_path*.
    """
    file_name = "%03d%s" % (no, image_extension(src, content_type))
    destination = os.path.join(attachments_dir_path, file_name)
    with open(destination, "wb") as out:
        out.write(image_data)
    return file_name
def save_original_file(local_image_path, attachments_dir_path, no, src):
    """Copy a repo-local image into the assets dir, keeping its format.

    Returns the generated file name relative to *attachments_dir_path*.
    """
    file_name = "%03d%s" % (no, image_extension(src))
    destination = os.path.join(attachments_dir_path, file_name)
    shutil.copyfile(local_image_path, destination)
    return file_name
def parse_data_uri(src):
    """Split a ``data:`` URI into ``(content_type, raw_bytes)``.

    Supports both base64 and percent-encoded payloads; the content type
    defaults to "text/plain" when omitted. Returns None when the URI is
    malformed or the payload cannot be decoded.
    """
    header, comma, payload = src.partition(",")
    if not comma or not header.startswith("data:"):
        return None
    params = header[5:].split(";")
    content_type = params[0] if params[0] else "text/plain"
    try:
        if "base64" in params[1:]:
            decoded = b64decode(payload)
        else:
            decoded = unquote_to_bytes(payload)
    except (BinasciiError, ValueError):
        return None
    return content_type, decoded
def save_image_as_png(image_file, output_path):
    """Re-encode *image_file* (path or file object) as PNG at *output_path*.

    Alpha is kept for RGBA/LA images and for palette images that carry
    transparency; every other mode is flattened to RGB. Returns True on
    success, False when the data is not a decodable image or writing
    fails.
    """
    try:
        with Image.open(image_file) as source:
            if source.mode in ("RGBA", "LA"):
                converted = source
            elif source.mode == "P" and "transparency" in source.info:
                converted = source.convert("RGBA")
            else:
                converted = source.convert("RGB")
            converted.save(output_path, "PNG")
    except (OSError, UnidentifiedImageError):
        return False
    return True
def pretty_md(text: str) -> str:
    """Normalize Markdown whitespace.

    Strips trailing whitespace from every line, then collapses any run of
    three or more consecutive newlines (two-plus blank lines) down to
    exactly two.
    """
    stripped = "\n".join(line.rstrip() for line in text.split("\n"))
    # One regex pass replaces the original's bounded (50-iteration)
    # replace loop and handles arbitrarily long newline runs.
    return re.sub(r"\n{3,}", "\n\n", stripped)
def main():
    """CLI entry point: extract a .lakebook archive and convert it to Markdown."""
    parser = argparse.ArgumentParser(description="Convert Yuque doc to markdown")
    parser.add_argument("lakebook", help="Lakebook file")
    parser.add_argument("output", help="Output directory")
    parser.add_argument(
        "--download-image", help="Download images to local", action="store_true"
    )
    args = parser.parse_args()
    if not os.path.exists(args.lakebook):
        print("Lakebook file not found: " + args.lakebook)
        sys.exit(1)
    # makedirs (not mkdir) so nested output paths like a/b/c also work.
    os.makedirs(args.output, exist_ok=True)
    # extract lakebook file
    random_tmp_dir = os.path.join(TMP_DIR, "lakebook_" + str(os.getpid()))
    try:
        extract_tar(args.lakebook, random_tmp_dir)
        # The archive is expected to contain exactly one top-level repo dir;
        # take the first directory entry (sorted for determinism).
        repo_dir = ""
        for entry in sorted(os.listdir(random_tmp_dir)):
            candidate = os.path.join(random_tmp_dir, entry)
            if os.path.isdir(candidate):
                repo_dir = candidate
                break
        if not repo_dir:
            print(".lakebook file is invalid")
            sys.exit(1)
        toc = read_toc(repo_dir)
        # print len of toc
        print("Total " + str(len(toc)) + " files")
        extract_repos(repo_dir, args.output, toc, args.download_image)
    finally:
        # Always remove the temp dir, even when conversion fails midway
        # (the original leaked it on every error path).
        shutil.rmtree(random_tmp_dir, ignore_errors=True)
# extract tar file in tar.gz
def extract_tar(tar_file, target_dir):
    """Extract every member of *tar_file* (tar / tar.gz) into *target_dir*.

    Creates *target_dir* (including parents) if needed. The archive is
    closed even on error ("with"), and the "data" extraction filter
    (Python 3.12+, backported to earlier security releases) blocks
    path-traversal members when available.
    """
    os.makedirs(target_dir, exist_ok=True)
    with tarfile.open(tar_file) as tar:
        try:
            tar.extractall(target_dir, filter="data")
        except TypeError:
            # Older Python without the filter= parameter.
            tar.extractall(target_dir)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
PS: 粘贴脚本时必须使用 Cmd + Shift + V 粘贴纯文本。