WordPress文章转换成本地Markdown文件—一键备份网站文章

感谢 TurboAI 对本博客的大力赞助。 创作不易,如果您觉得有帮助,请 支持LIncol29! 为了让我能够继续创作更好的内容,你也可以选择订阅博客的 VIP ,包年VIP仅需10元/年,所有VIP内容免费观看

前言

如果想从 WordPress 转战 Hugo、Halo 等其他博客平台,需要先把文章导出,再上传至新平台。

或者是想将文章转换成 Markdown 文件,将所有文章备份到本地,防止文章丢失。此时可以使用自动化工具将文章自动转换为 md 文件。

准备流程

  • 在wordpress站点中 导出-选择导出的内容,这里可以自行选择所有内容还是已发布文章。

img

  • 导出的文件为xml格式

img

转换步骤

  1. 下载Git工具
  2. 克隆此 GitHub 项目 https://github.com/palaniraja/blog2md
  3. cd到项目目录
  4. 运行 npm install 安装依赖项(只需输入npm install,会自动安装项目所需的依赖项)
  5. 运行 node index.js w your-wordpress-backup-export.xml out (将 your-wordpress-backup-export.xml 修改为自己的 xml 文件名,out 为输出目录)

上面 第三步 tips

cd 到项目目录 或者 在红框输入 cmd 直接进入命令界面

img

成果图

  • 文章将输出在 out 目录下

img

img

附加改善

如果你不想生成的目录中含有 Comment(评论文件),可以将 index.js 文件中的 writeComments 注释掉,含有此函数名的代码全部注释即可。

img

禁止md文件生成yaml头信息,注释fileHeader变量的赋值。

img

md文件名改为title,修改为文章标题.md

img

下面是我目前使用的 index.js,你可以自行替换。

'use strict';

/***
    Usage: blog2md b|w <BLOGGER/WordPress BACKUP XML> <OUTPUT DIR>

*/


const fs = require('fs');
const os = require('os');
const path = require('path');
const xml2js = require('xml2js');
const sanitize = require('sanitize-filename');
const TurndownService = require('turndown');
var moment = require('moment');

// Shared HTML-to-Markdown converter used by both importers; code blocks are
// emitted as fenced blocks delimited by three backticks.
var tds = new TurndownService({
    codeBlockStyle: 'fenced',
    fence: '```'
});

// Custom rule for <pre> elements: wrap the already-converted inner content in a
// plain three-backtick fence, each fence on its own line.
tds.addRule('wppreblock', {
    filter: ['pre'],
    replacement: (content) => ['```', content, '```'].join('\n')
});

// console.log(`No. of arguments passed: ${process.argv.length}`);

if (process.argv.length < 5){
    // ${process.argv[1]}
    console.log(`Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> m|s`)
    console.log(`\t b for parsing Blogger(Blogspot) backup`);
    console.log(`\t w for parsing WordPress backup`);
    return 1;
}

var option = process.argv[2];
var inputFile =  process.argv[3];

var outputDir = process.argv[4];

var mergeComments = (process.argv[5] == 'm')?'m':'s' ;
/** Apply a fix to WordPress posts to convert newlines to paragraphs. */
var applyParagraphFix = (process.argv.indexOf('paragraph-fix') >= 0);


if (fs.existsSync(outputDir)) {
    console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`)
}
else{
    fs.mkdirSync(outputDir);
}


if (mergeComments == 'm'){
    console.log(`INFO: Comments requested to be merged along with posts. (m)`);
}
else{
    console.log(`INFO: Comments requested to be a separate .md file(m - default)`);
}



if( option.toLowerCase() == 'b'){
    bloggerImport(inputFile, outputDir);
}
else if(option.toLowerCase() == 'w'){
    wordpressImport(inputFile, outputDir);
}
else {
    console.log('Only b (Blogger) and w (WordPress) are valid options');
    return;
}





/**
 * Convert a WordPress export XML file into one Markdown file per post.
 *
 * The file is read asynchronously and parsed with xml2js. Items whose
 * `wp:status` is "private" or "inherit" are skipped. For every remaining item
 * the `content:encoded` HTML is converted with the shared Turndown instance
 * and written to `<outputDir>/<sanitized post title>.md`.
 *
 * The front-matter header is intentionally disabled (the fileHeader assignment
 * is commented out), and approved comments are collected but not written
 * because the writeComments call is commented out.
 *
 * @param {string} backupXmlFile Path of the WordPress export XML file.
 * @param {string} outputDir     Directory that receives the generated .md files.
 */
function wordpressImport(backupXmlFile, outputDir){
    var parser = new xml2js.Parser();

    // Last element of an xml2js value array, or '' when the element is absent.
    // Reads without mutating the parsed document (the previous code used pop()).
    function lastOrEmpty(values){
        return (values && values.length) ? values[values.length - 1] : '';
    }

    fs.readFile(backupXmlFile, function(readErr, data) {
        // Previously the read error was ignored and undefined data was handed to the parser.
        if (readErr) {
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            return 1;
        }

        parser.parseString(data, function (err, result) {
            if (err) {
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
                return 1;
            }

            // A document without any <item> has no `item` key; treat that as an empty list
            // instead of crashing on `.length`.
            var channel = result && result.rss && result.rss.channel && result.rss.channel[0];
            var posts = (channel && channel.item) || [];

            console.log(`Total Post count: ${posts.length}`);

            posts = posts.filter(function(post){
                var status = post["wp:status"] ? post["wp:status"].join('') : '';
                return status != "private" && status != "inherit"
            });

            console.log(`Post count: ${posts.length}`);

            posts.forEach(function(post){
                var rawTitle = post.title[0].trim();
                // Single quotes are doubled for use inside the single-quoted title of the
                // (currently disabled) front-matter header.
                var title = rawTitle.replace(/'/g, "''");

                // draft, published and tagString feed the disabled header below; they are
                // kept so that re-enabling the header keeps working.
                var draft = post["wp:status"] == "draft"
                var published = post.pubDate;
                var comments = post["wp:comment"] || [];

                // The file is named after the post title. Sanitize the raw title so characters
                // that are not valid in file names cannot break the write, and fall back to the
                // post id when nothing usable remains.
                // fname = sanitize(decodeURI(post["wp:post_name"][0])) || post["wp:post_id"];
                var baseName = sanitize(rawTitle) || String(post["wp:post_id"]);

                console.log(`\n\n\n\ntitle: '${title}'`);
                console.log(`published: '${published}'`);

                if (comments.length){
                    console.log(`comments: '${comments.length}'`);
                }

                var tags = (post.category || []).map(function (category){
                    return category['_'];
                });
                var tagString = '';
                if (tags.length){
                    tagString = 'tags: [\'' + tags.join("', '") + "']\n";
                }

                var pmap = {fname: outputDir+'/'+baseName+'-comments.md', comments: []};

                var fname = outputDir+'/'+baseName+'.md';
                pmap.postName = fname;
                console.log(`fname: '${fname}'`);

                var fileHeader = '';

                if (post["content:encoded"]){
                    var postContent = post["content:encoded"].toString();
                    if (applyParagraphFix && !/<p>/i.test(postContent)) {
                        postContent = '<p>' + postContent.replace(/(\r?\n){2}/g, '</p>\n\n<p>') + '</p>';
                    }
                    var content = '<div>'+postContent+'</div>'; //to resolve error if plain text returned
                    var markdown = tds.turndown(content);

                    // Front matter is disabled on purpose; uncomment to emit it again.
                    //fileHeader = `---\ntitle: '${title}'\ndate: ${published}\ndraft: ${draft}\n${tagString}---\n`;
                    var fileContent = `${fileHeader}\n${markdown}`;
                    pmap.header = `${fileHeader}\n`;

                    writeToFile(fname, fileContent);
                }

                //comments:
                /*
                    "wp:comment" [.each]
                        wp:comment_author[0]
                        wp:comment_author_email[0]
                        wp:comment_author_url[0]
                        wp:comment_date[0]
                        wp:comment_content[0]
                        wp:comment_approved[0] == 1
                    wp:post_id

                */
                comments.forEach(function(comment){
                    // The approval flag arrives as text, and any non-empty string (including "0")
                    // is truthy, so compare against "1" explicitly as the field notes above say.
                    var approved = String(lastOrEmpty(comment["wp:comment_approved"])).trim();
                    if (approved !== '1'){
                        return;
                    }

                    var cmt = {title:'', published:'', content:'', author:{}};

                    cmt.published = lastOrEmpty(comment["wp:comment_date"]);

                    // Check for the content element before dereferencing it.
                    if (comment["wp:comment_content"]){
                        cmt.content = tds.turndown('<div>'+lastOrEmpty(comment["wp:comment_content"])+'</div>');
                    }

                    cmt.author.name = lastOrEmpty(comment["wp:comment_author"]);
                    cmt.author.email = lastOrEmpty(comment["wp:comment_author_email"]);
                    cmt.author.url = lastOrEmpty(comment["wp:comment_author_url"]);

                    pmap.comments.push(cmt);
                });

                //just a hack to re-use blogger writecomments method
                if (pmap && pmap.comments && pmap.comments.length){
                    // Comment output is disabled on purpose; uncomment to write comment files.
                    //writeComments({"0": pmap});
                }

            });

        });
    });

}




/**
 * Build a lowercase, hyphen-separated file name from arbitrary text.
 *
 * @param {string} text Source text, e.g. a post title.
 * @returns {string} The derived file name (without extension).
 */
function getFileName(text) {
    // Step 1: let sanitize() strip characters it considers unsafe.
    var cleaned = sanitize(text);
    // Step 2: drop dots and single quotes entirely.
    var withoutDotsAndQuotes = cleaned.replace(/[\.']/g, '');
    // Step 3: every character that is not an ASCII letter or digit becomes a hyphen.
    var hyphenated = withoutDotsAndQuotes.replace(/[^a-z0-9]/gi, '-');
    // Step 4: collapse runs of hyphens into one.
    var collapsed = hyphenated.replace(/[\-]{2,}/g, '-');
    // Step 5: lower-case the result.
    return collapsed.toLowerCase();
}

/**
 * Convert a Blogger (Blogspot) backup XML file into one Markdown file per post.
 *
 * Feed entries whose id contains ".post-" are split into posts (no
 * `thr:in-reply-to`) and comments (with `thr:in-reply-to`). Each post's HTML
 * content is converted with the shared Turndown instance and written to
 * `<outputDir>/<file name derived from the title>.md`. Comments are grouped by
 * the id of the post they reply to and handed to writeComments when that
 * function is available.
 *
 * The front-matter header is intentionally disabled (the fileHeader assignment
 * is commented out).
 *
 * @param {string} backupXmlFile Path of the Blogger backup XML file.
 * @param {string} outputDir     Directory that receives the generated .md files.
 */
function bloggerImport(backupXmlFile, outputDir){
    var parser = new xml2js.Parser();
    // __dirname + '/foo.xml'
    fs.readFile(backupXmlFile, function(readErr, data) {
        // Previously the read error was ignored and undefined data was handed to the parser.
        if (readErr){
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            return 1;
        }

        parser.parseString(data, function (err, result) {
            if (err){
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`); return 1;
            }
            // console.dir(JSON.stringify(result)); return;

            if(result && result.feed && result.feed.entry) {
                var contents = result.feed.entry;
                console.log(`Total no. of entries found : ${contents.length}`);

                var posts = contents.filter(function(entry){
                    return entry.id[0].indexOf('.post-')!=-1 && !entry['thr:in-reply-to']
                });

                var comments = contents.filter(function(entry){
                    return entry.id[0].indexOf('.post-')!=-1 && entry['thr:in-reply-to']
                });

                console.log(`Content-posts ${posts.length}`);
                console.log(`Content-Comments ${comments.length}`);

                var fileHeader = '';
                var postMaps = {};

                posts.forEach(function(entry){
                    var postMap = {};

                    var title = entry.title[0]['_'];
                    // title = tds.turndown(title);
                    if (title && title.indexOf("'")!=-1){
                         title = title.replace(/'/g, "''");
                    }
                    postMap.pid = entry.id[0].split('-').pop()

                    // published, draft, alias and tagString feed the disabled header below;
                    // they are kept so that re-enabling the header keeps working.
                    var published = entry.published;
                    var draft = 'false';
                    if(entry['app:control'] && (entry['app:control'][0]['app:draft'][0] == 'yes')){
                        draft =  'true';
                    }

                    console.log(`title: "${title}"`);
                    console.log(`date: ${published}`);
                    console.log(`draft: ${draft}`);

                    // An untitled post has no text node; pass '' instead of undefined and fall
                    // back to the post id so the file still gets a usable name.
                    var sanitizedTitle = getFileName(title || '') || postMap.pid;

                    var urlLink = (entry.link || []).filter(function(link){
                        return link["$"].type && link["$"].rel && link["$"].rel=='alternate' && link["$"].type=='text/html'
                    });

                    var url=''

                    if (urlLink[0] && urlLink[0]['$'] && urlLink[0]['$'].href){
                        url = urlLink[0]['$'].href;
                    }

                    var fname = outputDir + '/' + path.basename(sanitizedTitle) + '.md';
                    console.log(fname);
                    postMap.postName = fname
                    // Anchor on the extension so a ".md" inside the directory part is never replaced.
                    postMap.fname = fname.replace(/\.md$/, '-comments.md');
                    postMap.comments = [];

                    // Reset per post: with a variable shared across iterations, a post without
                    // content was written with the previous post's markdown.
                    var markdown = '';
                    if (entry.content && entry.content[0] && entry.content[0]['_']){
                        markdown = tds.turndown(entry.content[0]['_']);
                    }

                    // Posts without any <category> element have no `category` key.
                    var categories = entry.category || [];

                    var tagLabel = categories.filter(function (tag){
                        return tag['$'].term && tag['$'].term.indexOf('http://schemas.google')==-1;
                    });
                    console.log(`No of category: ${categories.length}`);

                    var tags = tagLabel.map(function(tag){
                        return tag['$'].term;
                    });

                    console.log(`tags: \n${tags.map(a=> '- '+a).join('\n')}\n`);

                    var tagString='';

                    if(tags.length){
                        tagString=`tags: \n${tags.map(a=> '- '+a).join('\n')}\n`;
                    }

                    console.dir(postMap);

                    console.log("\n\n\n\n\n");

                    var alias = url.replace(/^.*\/\/[^\/]+/, '');

                    // Front matter is disabled on purpose; uncomment to emit it again.
                    //fileHeader = `---\ntitle: '${title}'\ndate: ${published}\ndraft: ${draft}\nurl: ${alias}\n${tagString}---\n`;
                    var fileContent = `${fileHeader}\n${markdown}`;

                    postMap.header = fileHeader;
                    postMaps[postMap.pid] = postMap;

                    writeToFile(fname, fileContent)

                });


                comments.forEach(function(entry){
                    var postId = entry['thr:in-reply-to'][0]["$"]["source"];
                    postId = path.basename(postId);

                    // Skip comments whose parent post was not collected above instead of
                    // crashing on `undefined.comments`.
                    if (!postMaps[postId]){
                        console.log(`WARNING: skipping comment for unknown post id ${postId}`);
                        return;
                    }

                    var comment = {published:'', title:'', content:''};

                    comment.published = entry['published'][0];

                    if(entry['title'][0] && entry['title'][0]["_"]){
                        comment.title = tds.turndown(entry['title'][0]["_"]);
                    }

                    if (entry['content'][0] && entry['content'][0]["_"]){
                        comment.content = tds.turndown(entry['content'][0]["_"]);
                    }

                    comment.author = {name: '', email: '', url: ''};

                    if(entry['author'][0]["name"] && entry['author'][0]["name"][0]){
                        comment.author.name = entry['author'][0]["name"][0];
                    }

                    if (entry['author'][0]["email"] && entry['author'][0]["email"][0]){
                        comment.author.email = entry['author'][0]["email"][0];
                    }

                    if (entry['author'][0]["uri"] && entry['author'][0]["uri"][0]){
                        comment.author.url = entry['author'][0]["uri"][0];
                    }

                    postMaps[postId].comments.push(comment);
                });

                // console.log(JSON.stringify(postMaps)); return;
                // writeComments may be commented out to suppress comment files; calling it
                // unconditionally then throws a ReferenceError, so only call it when defined.
                if (typeof writeComments === 'function'){
                    writeComments(postMaps);
                }

            }
            console.log('Done');
        });
    });

}


// function writeComments(postMaps){

//     if (mergeComments == 'm'){
//         console.log('DEBUG: merge comments requested');
//     }else{
//         console.log('DEBUG: separate comments requested (defaulted)');
//     }
//     for (var pmap in postMaps){
//         var comments = postMaps[pmap].comments;
//         console.log(`post id: ${pmap} has ${comments.length} comments`);
//         // console.dir(comments);

//         if (comments.length){
//             var ccontent = '';
//             comments.forEach(function(comment){
//                 var readableDate = '<time datetime="'+comment.published+'">' + moment(comment.published).format("MMM d, YYYY") + '</time>';

//                 ccontent += `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
//             });

//             if (mergeComments == 'm'){
//                 writeToFile(postMaps[pmap].postName, `\n---\n### Comments:\n${ccontent}`, true);
//             }else{
//                 writeToFile(postMaps[pmap].fname, `${postMaps[pmap].header}\n${ccontent}`);
//             }

//         }
//     }
// }



/**
 * Synchronously write or append text to a file, logging progress and errors.
 *
 * Errors are caught and logged; nothing is thrown to the caller.
 *
 * @param {string}  filename Target file path.
 * @param {string}  content  Text to write.
 * @param {boolean} [append=false] Append to the file instead of replacing it.
 */
function writeToFile(filename, content, append=false){
    // Pick the fs operation and the matching log texts once, then share one try/catch.
    const persist = append ? fs.appendFileSync : fs.writeFileSync;
    const startMessage = append
        ? `DEBUG: going to append to ${filename}`
        : `DEBUG: going to write to ${filename}`;
    const successMessage = append
        ? `Successfully appended to ${filename}`
        : `Successfully written to ${filename}`;

    console.log(startMessage);
    try {
        persist(filename, content);
        console.log(successMessage);
    } catch (err) {
        const failurePrefix = append
            ? `Error while appending to ${filename}`
            : `Error while writing to ${filename}`;
        console.log(`${failurePrefix} - ${JSON.stringify(err)}`);
        console.dir(err);
    }
}

问题

当使用 npm install 时出现报错

npm notice
npm notice New major version of npm available! 8.1.2 -> 10.8.3
npm notice Changelog: https://github.com/npm/cli/releases/tag/v10.8.3
npm notice Run npm install -g npm@10.8.3 to update!
npm notice
npm ERR! code CERT_HAS_EXPIRED
npm ERR! errno CERT_HAS_EXPIRED
npm ERR! request to https://registry.npm.taobao.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz failed, reason: certificate has expired

npm ERR! A complete log of this run can be found in:
npm ERR!     C:\Users\asus\AppData\Local\npm-cache\_logs\2024-09-18T13_02_58_438Z-debug.log

img

通过查看日志发现是淘宝镜像有问题。

重新设置国内镜像

npm config set registry=https://registry.npmmirror.com 

//执行以下命令查看是否配置成功
npm config get registry
创作不易,如果您觉得有帮助,请支持LIncol29!
如有需要,请至网站地图学习本博客的教程
博客订阅:通过RSS或关注公众号[Lincol的编程世界] | 广告招租与合作请留言
本文链接:https://www.lincol29.cn/convert-wordpress-tomd
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0协议转载请注明文章地址及作者哦~

评论

  1. Linux Chrome
    3 天前
    2025-1-11 8:26:44

    那如果用MD转回WP会不会出现兼容性问题呢😳 前两天我也试了一下用WP转TP 然后又用TP转回WP 全局转换 多多少少会出点问题 如果用插件就不会 但不想涉及插件 所以就自己试了 看到你发这个所以我也想分享出来😂

    • 博主
      刘郎
      Windows Chrome
      3 天前
      2025-1-11 9:29:09

      会有一些问题,比如WP的表格可以转换为markdown形式。但如果md转回WP,表格不会正常显示。还有一个就是WP转MD的时候你的代码语言会消失,变成默认plain。

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇