Webスクレイピングにおけるリバースエンジニアリング(逆解析)と暗号化技術

Webスクレイピングの高度化手法と暗号化・復号化の実装

  1. 法令文書サイトの解析
// Minimal browser-global shims so the browser-oriented code below can be loaded in Node.js.
// The assignments go through globalThis rather than bare, undeclared identifiers: a bare
// `window = ...` throws a ReferenceError in strict mode and in ES modules.
globalThis.navigator = {};
globalThis.window = globalThis;
/**
 * Formats this date according to a pattern string.
 *
 * Recognised tokens: y (year), M (month, 1-12), d (day of month), h (hours, 0-23),
 * m (minutes), s (seconds), q (quarter, 1-4) and S (milliseconds).
 * Only the first run of each token is replaced. A single-letter token inserts the
 * raw value; a longer run inserts the value left-padded with zeros to two digits.
 * For the year, a run of N (<= 4) letters keeps the part of the year from index
 * 4 - N onward; a run longer than four letters yields the full year.
 *
 * @param {string} fmt - Pattern, e.g. "yyyy-MM-dd hh:mm:ss".
 * @returns {string} The pattern with the recognised tokens substituted.
 */
Date.prototype.format = function (fmt) {
    const fields = {
        "M+": this.getMonth() + 1,
        "d+": this.getDate(),
        "h+": this.getHours(),
        "m+": this.getMinutes(),
        "s+": this.getSeconds(),
        "q+": Math.floor((this.getMonth() + 3) / 3),
        "S": this.getMilliseconds()
    };

    // Use the match result directly instead of the deprecated RegExp.$1 static state.
    const yearToken = /(y+)/.exec(fmt);
    if (yearToken) {
        const year = String(this.getFullYear());
        // Clamp at 0: a negative start index would count from the end of the string
        // and cut "yyyyy" down to the last digit of the year.
        const start = Math.max(0, 4 - yearToken[1].length);
        fmt = fmt.replace(yearToken[1], year.slice(start));
    }

    Object.keys(fields).forEach(function (pattern) {
        const token = new RegExp("(" + pattern + ")").exec(fmt);
        if (!token) {
            return;
        }
        const value = String(fields[pattern]);
        const rendered = token[1].length === 1 ? value : ("00" + value).slice(value.length);
        fmt = fmt.replace(token[1], rendered);
    });

    return fmt;
};

/**
 * Minimal CryptoJS-style core library.
 *
 * Exposes:
 *   - lib.Base                   prototype-based inheritance helpers (extend/create/mixIn/clone)
 *   - lib.WordArray              byte sequence stored as big-endian 32-bit words plus a byte count
 *   - enc.Hex / Latin1 / Utf8    string <-> WordArray encoders
 *   - lib.BufferedBlockAlgorithm block buffering for block-oriented algorithms
 *   - lib.Hasher                 abstract hasher built on BufferedBlockAlgorithm
 *   - algo                       empty registry for concrete algorithms
 *
 * Fixes relative to the previous revision:
 *   - the base object is an object literal, so derived types are now created with
 *     `baseExtend.extend(...)`; calling `baseExtend(...)` threw a TypeError on load;
 *   - `thatSigBytes` / `sigBytes` are declared locally instead of leaking as implicit globals;
 *   - the HMAC helper looks HMAC up in the `algo` registry instead of an unbound identifier.
 *
 * @param {Math} math - Math implementation (ceil, max, min, random are used).
 */
var CryptoUtil = CryptoUtil || function (math) {
    var lib = {};
    var core = lib.lib = {};

    // Scratch constructor whose prototype is re-pointed on every extend() call.
    var Base = function () {
    };

    var baseExtend = core.Base = {
        // Creates a new object inheriting from `this`, optionally overlaid with `properties`.
        extend: function (properties) {
            Base.prototype = this;
            var newObj = new Base();
            if (properties) {
                newObj.mixIn(properties);
            }
            // Without an own init, install one that forwards to the parent's init.
            if (!newObj.hasOwnProperty("init")) {
                newObj.init = function () {
                    newObj.$super.init.apply(this, arguments);
                };
            }
            // Lets `new Type.init(...)` produce objects that inherit from Type.
            newObj.init.prototype = newObj;
            newObj.$super = this;
            return newObj;
        },

        // Extends `this` and runs init on the new instance with the given arguments.
        create: function () {
            var instance = this.extend();
            instance.init.apply(instance, arguments);
            return instance;
        },

        init: function () {
        },

        // Copies own enumerable properties of `properties` onto `this`.
        mixIn: function (properties) {
            for (var key in properties) {
                if (properties.hasOwnProperty(key)) {
                    this[key] = properties[key];
                }
            }
            if (properties.hasOwnProperty("toString")) {
                this.toString = properties.toString;
            }
        },

        clone: function () {
            return this.init.prototype.extend(this);
        }
    };

    var WordArray = core.WordArray = baseExtend.extend({
        /**
         * @param {number[]} [words] - 32-bit words; defaults to an empty array.
         * @param {number} [sigBytes] - Number of significant bytes; defaults to 4 * words.length.
         */
        init: function (words, sigBytes) {
            words = this.words = words || [];
            // Loose comparison on purpose: both null and undefined select the default.
            this.sigBytes = sigBytes != undefined ? sigBytes : 4 * words.length;
        },

        // Stringifies with the given encoder, hex by default.
        toString: function (encoder) {
            return (encoder || HexEncoder).stringify(this);
        },

        // Appends `wordArray` to this one in place and returns `this`.
        concat: function (wordArray) {
            var thisWords = this.words;
            var thatWords = wordArray.words;
            var thisSigBytes = this.sigBytes;
            var thatSigBytes = wordArray.sigBytes;
            var i;

            this.clamp();
            if (thisSigBytes % 4) {
                // Not word-aligned: copy one byte at a time into the right lane.
                for (i = 0; i < thatSigBytes; i++) {
                    var thatByte = (thatWords[i >>> 2] >>> (24 - 8 * (i % 4))) & 255;
                    thisWords[(thisSigBytes + i) >>> 2] |= thatByte << (24 - 8 * ((thisSigBytes + i) % 4));
                }
            } else if (65535 < thatWords.length) {
                // Large input: copy word by word rather than spreading it into push.apply.
                for (i = 0; i < thatSigBytes; i += 4) {
                    thisWords[(thisSigBytes + i) >>> 2] = thatWords[i >>> 2];
                }
            } else {
                thisWords.push.apply(thisWords, thatWords);
            }
            this.sigBytes += thatSigBytes;
            return this;
        },

        // Zeroes the bits past sigBytes in the last word and truncates the word list.
        clamp: function () {
            var words = this.words;
            var sigBytes = this.sigBytes;
            words[sigBytes >>> 2] &= 4294967295 << (32 - 8 * (sigBytes % 4));
            words.length = math.ceil(sigBytes / 4);
        },

        clone: function () {
            var clone = baseExtend.clone.call(this);
            clone.words = this.words.slice(0);
            return clone;
        },

        // Builds a WordArray of `byteLength` bytes from math.random().
        // NOTE(review): math.random is not a cryptographically secure source.
        random: function (byteLength) {
            var words = [];
            for (var i = 0; i < byteLength; i += 4) {
                words.push((4294967296 * math.random()) | 0);
            }
            return new WordArray.init(words, byteLength);
        }
    });

    var encoders = lib.enc = {};

    var HexEncoder = encoders.Hex = {
        stringify: function (wordArray) {
            var words = wordArray.words;
            var sigBytes = wordArray.sigBytes;
            var hex = [];
            for (var i = 0; i < sigBytes; i++) {
                var byte = (words[i >>> 2] >>> (24 - 8 * (i % 4))) & 255;
                hex.push((byte >>> 4).toString(16));
                hex.push((byte & 15).toString(16));
            }
            return hex.join("");
        },

        parse: function (hexStr) {
            var hexStrLength = hexStr.length;
            var words = [];
            for (var i = 0; i < hexStrLength; i += 2) {
                words[i >>> 3] |= parseInt(hexStr.substr(i, 2), 16) << (24 - 4 * (i % 8));
            }
            return new WordArray.init(words, hexStrLength / 2);
        }
    };

    var Latin1Encoder = encoders.Latin1 = {
        stringify: function (wordArray) {
            var words = wordArray.words;
            var sigBytes = wordArray.sigBytes;
            var chars = [];
            for (var i = 0; i < sigBytes; i++) {
                chars.push(String.fromCharCode((words[i >>> 2] >>> (24 - 8 * (i % 4))) & 255));
            }
            return chars.join("");
        },

        parse: function (latin1Str) {
            var strLength = latin1Str.length;
            var words = [];
            for (var i = 0; i < strLength; i++) {
                words[i >>> 2] |= (latin1Str.charCodeAt(i) & 255) << (24 - 8 * (i % 4));
            }
            return new WordArray.init(words, strLength);
        }
    };

    var Utf8Encoder = encoders.Utf8 = {
        // Throws Error("Malformed UTF-8 data") when the bytes do not decode.
        stringify: function (wordArray) {
            try {
                return decodeURIComponent(escape(Latin1Encoder.stringify(wordArray)));
            } catch (error) {
                throw Error("Malformed UTF-8 data");
            }
        },

        parse: function (utf8Str) {
            return Latin1Encoder.parse(unescape(encodeURIComponent(utf8Str)));
        }
    };

    var BufferedBlockAlgorithm = core.BufferedBlockAlgorithm = baseExtend.extend({
        reset: function () {
            this._data = new WordArray.init();
            this._nDataBytes = 0;
        },

        // Buffers `data`; strings are converted to a WordArray via the UTF-8 encoder first.
        _append: function (data) {
            if (typeof data === "string") {
                data = Utf8Encoder.parse(data);
            }
            this._data.concat(data);
            this._nDataBytes += data.sigBytes;
        },

        /**
         * Feeds every ready block to this._doProcessBlock and removes it from the buffer.
         *
         * @param {boolean} [doFlush] - When truthy, a trailing partial block is rounded up
         *     and processed too; otherwise _minBufferSize blocks are held back.
         * @returns {WordArray} The words that were removed from the buffer.
         */
        _process: function (doFlush) {
            var data = this._data;
            var dataWords = data.words;
            var dataSigBytes = data.sigBytes;
            var blockSize = this.blockSize;

            var nBlocksReady = dataSigBytes / (4 * blockSize);
            nBlocksReady = doFlush
                ? math.ceil(nBlocksReady)
                : math.max((nBlocksReady | 0) - this._minBufferSize, 0);

            var nWordsReady = nBlocksReady * blockSize;
            var nBytesReady = math.min(4 * nWordsReady, dataSigBytes);

            // Stays undefined when nothing is ready; WordArray.init then uses an empty list.
            var processedWords;
            if (nWordsReady) {
                for (var offset = 0; offset < nWordsReady; offset += blockSize) {
                    this._doProcessBlock(dataWords, offset);
                }
                processedWords = dataWords.splice(0, nWordsReady);
                data.sigBytes -= nBytesReady;
            }
            return new WordArray.init(processedWords, nBytesReady);
        },

        clone: function () {
            var clone = baseExtend.clone.call(this);
            clone._data = this._data.clone();
            return clone;
        },

        _minBufferSize: 0
    });

    // Abstract hasher: concrete algorithms supply _doReset, _doProcessBlock and _doFinalize.
    core.Hasher = BufferedBlockAlgorithm.extend({
        cfg: baseExtend.extend(),

        init: function (config) {
            this.cfg = this.cfg.extend(config);
            this.reset();
        },

        reset: function () {
            BufferedBlockAlgorithm.reset.call(this);
            this._doReset();
        },

        update: function (messageUpdate) {
            this._append(messageUpdate);
            this._process();
            return this;
        },

        finalize: function (messageUpdate) {
            if (messageUpdate) {
                this._append(messageUpdate);
            }
            return this._doFinalize();
        },

        blockSize: 16,

        // Returns a one-shot function (message, cfg) => digest for the given hasher.
        _createHelper: function (hasher) {
            return function (message, cfg) {
                return (new hasher.init(cfg)).finalize(message);
            };
        },

        // Returns a one-shot function (message, key) => HMAC for the given hasher.
        // HMAC is resolved from the algo registry at call time; nothing in this block
        // registers it, so an HMAC implementation must be added to `algo` first.
        _createHmacHelper: function (hasher) {
            return function (message, key) {
                return (new algorithms.HMAC.init(hasher, key)).finalize(message);
            };
        }
    });

    var algorithms = lib.algo = {};
    return lib;
}(Math);

// TripleDES encryption utility (CBC mode, PKCS#7 padding) built on the global CryptoJS object.
var TripleDESUtil = {
    // Default IV: today's date rendered as "yyyyMMdd" via Date.prototype.format.
    getIV: function () {
        var today = new Date();
        return today.format("yyyyMMdd");
    },

    // Encrypts `plaintext` and returns the cipher result's string form.
    // A falsy `key` yields ""; a falsy `iv` falls back to getIV().
    encrypt: function (plaintext, key, iv) {
        if (!key) {
            return "";
        }
        var cipherParams = CryptoJS.TripleDES.encrypt(
            plaintext,
            CryptoJS.enc.Utf8.parse(key),
            {
                iv: CryptoJS.enc.Utf8.parse(iv || TripleDESUtil.getIV()),
                mode: CryptoJS.mode.CBC,
                padding: CryptoJS.pad.Pkcs7
            }
        );
        return cipherParams.toString();
    },

    // Decrypts `ciphertext` and returns the result stringified as UTF-8.
    // A falsy `key` yields ""; a falsy `iv` falls back to getIV().
    decrypt: function (ciphertext, key, iv) {
        if (!key) {
            return "";
        }
        var plainWords = CryptoJS.TripleDES.decrypt(
            ciphertext,
            CryptoJS.enc.Utf8.parse(key),
            {
                iv: CryptoJS.enc.Utf8.parse(iv || TripleDESUtil.getIV()),
                mode: CryptoJS.mode.CBC,
                padding: CryptoJS.pad.Pkcs7
            }
        );
        var plainText = CryptoJS.enc.Utf8.stringify(plainWords);
        return plainText.toString();
    }
};

/**
 * Builds a random string drawn from the characters 0-9, a-z and A-Z.
 *
 * Each character is picked with Math.floor(Math.random() * alphabet.length), so every
 * character is equally likely. The previous Math.round(Math.random() * (n - 1)) form
 * gave the first and last characters only half the probability of the others.
 * Math.random() is not cryptographically secure.
 *
 * @param {number} length - Number of characters to generate; values <= 0 yield "".
 * @returns {string} The generated string.
 */
function generateRandomString(length) {
    const alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    let result = "";
    for (let i = 0; i < length; i++) {
        result += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return result;
}

/**
 * Builds the cipher token: a 24-character random salt, the current date as "yyyyMMdd"
 * (used as the IV) and the TripleDES-encrypted current timestamp are concatenated,
 * then rendered through stringToBinary.
 *
 * The month is now always the zero-padded 1-based month. The previous expression fell
 * back to the 0-based getMonth() for October-December, producing "9", "10" and "11"
 * (and a 7-character IV in October).
 *
 * @returns {string} Output of stringToBinary for salt + iv + encrypted timestamp.
 */
function generateCipher() {
    // Left-pads a one-digit number with a single zero.
    function pad2(value) {
        return (value < 10 ? "0" : "") + value;
    }

    const now = new Date();
    const timestamp = now.getTime().toString();
    const randomSalt = generateRandomString(24);
    const iv = now.getFullYear().toString() + pad2(now.getMonth() + 1) + pad2(now.getDate());
    const encrypted = TripleDESUtil.encrypt(timestamp, randomSalt, iv).toString();
    return stringToBinary(randomSalt + iv + encrypted);
}

/**
 * Renders each UTF-16 code unit of `str` as its base-2 number (no zero padding)
 * and joins the results with single spaces.
 *
 * @param {string} str - Input text.
 * @returns {string} Space-separated binary codes; "" for an empty string.
 */
function stringToBinary(str) {
    return str
        .split("")
        .map(function (unit) {
            return unit.charCodeAt(0).toString(2);
        })
        .join(" ");
}
  2. 民俗文化サイトのCookie生成と暗号化
// HMAC-SHA1署名生成関数
function generateSignature(param1, param2, secretKey) {
    var CryptoJS = require('crypto-js');
    return CryptoJS.HmacSHA1(param1 + param2, secretKey).toString();
}

// AES encryption of the request payload.
// Serialises a browser-fingerprint object plus the user context to JSON, encrypts it
// with AES (CBC mode, PKCS#7 padding) through the global CryptoJS object, and returns
// the string form of the resulting ciphertext.
//   userAgent - value stored under browserInfo.userAgent
//   memberId  - value stored under userContext.memberId
function encryptRequestData(userAgent, memberId) {
    // NOTE(review): this literal is 23 characters long, which is not one of the standard
    // AES key sizes (16/24/32 bytes) - confirm against the target before relying on it.
    const secretKey = "SecretEncryptionKey2023";

    // Properties are evaluated top to bottom, so the fingerprint helpers run in this order.
    const browserInfo = {
        "userAgent": userAgent,
        "webdriver": false,
        "language": "ja-JP",
        "colorDepth": 24,
        "deviceMemory": 8,
        "hardwareConcurrency": 8,
        "screenResolution": [1920, 1080],
        "availableScreenResolution": [1920, 1040],
        "timezoneOffset": -540,
        "timezone": "Asia/Tokyo",
        "sessionStorage": true,
        "localStorage": true,
        "indexedDb": true,
        "addBehavior": false,
        "openDatabase": true,
        "cpuClass": "unknown",
        "platform": "Win32",
        "plugins": "abcdef1234567890abcdef1234567890abcdef12",
        "canvas": generateCanvasFingerprint(),
        "webgl": generateWebGLFingerprint(),
        "webglVendorAndRenderer": "Google Inc. (NVIDIA)~ANGLE (NVIDIA, NVIDIA GeForce GTX 1060 6GB Direct3D11 vs_5_0 ps_5_0, D3D11)",
        "adBlock": false,
        "hasLiedLanguages": false,
        "hasLiedResolution": false,
        "hasLiedOs": false,
        "hasLiedBrowser": false,
        "touchSupport": generateTouchFingerprint(),
        "fonts": generateFontFingerprint(),
        "audio": generateAudioFingerprint(),
        "timestamp": new Date().getTime(),
        "devicePixelRatio": window.devicePixelRatio || 1
    };

    const userContext = {
        "memberId": memberId,
        "sessionId": generateSessionId()
    };

    const requestPayload = JSON.stringify({
        "browserInfo": browserInfo,
        "userContext": userContext
    });

    // The IV is the first 16 characters of the key string.
    const cipherParams = CryptoJS.AES.encrypt(
        requestPayload,
        CryptoJS.enc.Utf8.parse(secretKey),
        {
            iv: CryptoJS.enc.Utf8.parse(secretKey.substring(0, 16)),
            mode: CryptoJS.mode.CBC,
            padding: CryptoJS.pad.Pkcs7
        }
    );
    return cipherParams.ciphertext.toString();
}

// Fingerprint generator functions.

// Draws a label containing a fresh Math.random() value on a 2D canvas and returns the
// last 50 characters of the canvas data URL.
function generateCanvasFingerprint() {
    const canvasEl = document.createElement('canvas');
    const context2d = canvasEl.getContext('2d');

    context2d.textBaseline = 'top';
    context2d.font = '14px Arial';

    const label = 'Canvas fingerprint ' + Math.random();
    context2d.fillText(label, 2, 2);

    const dataUrl = canvasEl.toDataURL();
    return dataUrl.slice(-50);
}

// Returns the WebGL VENDOR and RENDERER parameters concatenated, or 'no-webgl' when a
// 'webgl' context cannot be obtained from a fresh canvas.
function generateWebGLFingerprint() {
    const gl = document.createElement('canvas').getContext('webgl');
    if (gl) {
        const vendor = gl.getParameter(gl.VENDOR);
        const renderer = gl.getParameter(gl.RENDERER);
        return vendor + renderer;
    }
    return 'no-webgl';
}

// Returns navigator.maxTouchPoints as a string when `window` exposes 'ontouchstart',
// otherwise '0'. A missing maxTouchPoints (null/undefined) is also reported as '0'
// instead of throwing on `.toString()`.
function generateTouchFingerprint() {
    if (!('ontouchstart' in window)) {
        return '0';
    }
    const points = navigator.maxTouchPoints;
    // Loose comparison on purpose: covers both null and undefined.
    return points == null ? '0' : points.toString();
}

// Returns one character per probed font, in list order: '1' when
// document.fonts.check('12px <font>') is truthy, '0' otherwise.
function generateFontFingerprint() {
    const candidates = 'Arial,Helvetica,Verdana,Times,Georgia,Courier'.split(',');
    let bits = '';
    for (const font of candidates) {
        bits += document.fonts.check('12px ' + font) ? '1' : '0';
    }
    return bits;
}

// Builds an oscillator -> analyser -> script processor -> gain -> destination graph,
// starts the oscillator, and returns Math.random().toString(36).substring(7).
// Returns 'no-audio' when neither window.AudioContext nor window.webkitAudioContext exists.
//
// The returned value does not read any audio data, so the context is closed before
// returning; previously every call left an open AudioContext behind.
function generateAudioFingerprint() {
    const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
    if (!AudioContextCtor) return 'no-audio';

    const context = new AudioContextCtor();
    try {
        const oscillator = context.createOscillator();
        const analyser = context.createAnalyser();
        const gain = context.createGain();
        const scriptProcessor = context.createScriptProcessor(4096, 1, 1);

        oscillator.type = 'triangle';
        oscillator.frequency.setValueAtTime(10000, context.currentTime);

        // Gain 0 keeps the graph silent.
        gain.gain.setValueAtTime(0, context.currentTime);

        oscillator.connect(analyser);
        analyser.connect(scriptProcessor);
        scriptProcessor.connect(gain);
        gain.connect(context.destination);

        oscillator.start(0);

        return Math.random().toString(36).substring(7);
    } finally {
        // close() may be missing on older prefixed implementations, hence the guard.
        if (typeof context.close === 'function') {
            const closing = context.close();
            if (closing && typeof closing.catch === 'function') {
                // Best-effort cleanup: a failed close must not turn into an unhandled rejection.
                closing.catch(function () {});
            }
        }
    }
}

// Returns "session_<Date.now()>_<random base-36 digits>".
function generateSessionId() {
    const createdAt = Date.now();
    const randomPart = Math.random().toString(36).substring(2);
    return ['session', createdAt, randomPart].join('_');
}

// Helper for generating the cookie value.
// Builds "<urlPath>?<key=value pairs, sorted, joined by &>", converts a fixed set of
// percent-encoded sequences back to literal characters, and returns the HMAC-SHA1 of
// that string keyed with the MD5 hex string of urlPath (both via the global CryptoJS).
function generateCookieValue(params, urlPath) {
    const pairs = [];
    for (const name in params) {
        pairs.push(name + "=" + params[name]);
    }
    pairs.sort();

    // Applied in this order; "%20" deliberately maps to "+".
    const replacements = [
        ["%3A", ":"],
        ["%2C", ","],
        ["%5D", "]"],
        ["%5B", "["],
        ["%20", "+"],
        ["%40", "@"]
    ];

    let queryString = urlPath + "?" + pairs.join('&');
    for (const [encoded, decoded] of replacements) {
        // Replaces every occurrence of the literal sequence.
        queryString = queryString.split(encoded).join(decoded);
    }

    const hmacKey = CryptoJS.MD5(urlPath).toString();
    return CryptoJS.HmacSHA1(queryString, hmacKey).toString();
}

本記事は教育目的で作成されたものです。実際のWebサイトをスクレイピングする際は、各サイトの利用規約を遵守してください。

タグ: Webスクレイピング 暗号化 TripleDES HMAC-SHA1 AES

5月27日 07:04 投稿