{ "cells": [ { "cell_type": "markdown", "id": "dda293ea", "metadata": {}, "source": [ "# Query Internet Archive CDX" ] }, { "cell_type": "code", "execution_count": 1, "id": "51110b96", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlkeytimestamporiginalmimetypestatuscodedigestlength
0com,skeptric)/about20211120235913https://skeptric.com/about/text/html200Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG73266
\n", "
" ], "text/plain": [ " urlkey timestamp original \\\n", "0 com,skeptric)/about 20211120235913 https://skeptric.com/about/ \n", "\n", " mimetype statuscode digest length \n", "0 text/html 200 Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7 3266 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import requests\n", "r = requests.get('http://web.archive.org/cdx/search/cdx', \n", " params={'url': 'skeptric.com/about/', 'output': 'json', 'to': '202111'})\n", "\n", "captures = r.json()\n", "\n", "import pandas as pd\n", "df = pd.DataFrame(captures[1:], columns=captures[0])\n", "df" ] }, { "cell_type": "code", "execution_count": 2, "id": "8099a6d0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "urlkey com,skeptric)/about\n", "timestamp 20211120235913\n", "original https://skeptric.com/about/\n", "mimetype text/html\n", "statuscode 200\n", "digest Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7\n", "length 3266\n", "Name: 0, dtype: object" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record = df.iloc[0]\n", "record" ] }, { "cell_type": "markdown", "id": "e30e9d52", "metadata": {}, "source": [ "# Fetch Original and Archived Content" ] }, { "cell_type": "markdown", "id": "3fe08c54", "metadata": {}, "source": [ "We can get the version available from the Wayback Machine like this:" ] }, { "cell_type": "code", "execution_count": 3, "id": "09cd1602", "metadata": {}, "outputs": [], "source": [ "wayback_url = f'http://web.archive.org/web/{record.timestamp}/{record.original}'\n", "wayback_content = requests.get(wayback_url).content" ] }, { "cell_type": "markdown", "id": "198883db", "metadata": {}, "source": [ "The digests don't match because the Internet Archive [changes the HTML](https://archive.org/post/1009990/cdx-digest-not-accurately-capturing-duplicates)" ] }, { "cell_type": "code", "execution_count": 4, "id": "9e853f59", "metadata": {}, "outputs": [], "source": [ "from hashlib import sha1\n", "from 
base64 import b32encode\n", "\n", "def sha1_digest(content: bytes) -> str:\n", " return b32encode(sha1(content).digest()).decode('ascii')" ] }, { "cell_type": "code", "execution_count": 5, "id": "cb7981bf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'OXZ5C2VPDFFRV6U3CCNM6QT7VKND6SSE'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sha1_digest(wayback_content)" ] }, { "cell_type": "markdown", "id": "3ac8b2fd", "metadata": {}, "source": [ "However we can get the original HTML captured:" ] }, { "cell_type": "code", "execution_count": 6, "id": "c33bd990", "metadata": {}, "outputs": [], "source": [ "original_url = f'http://web.archive.org/web/{record.timestamp}id_/{record.original}'\n", "original_content = requests.get(original_url).content" ] }, { "cell_type": "markdown", "id": "606d85a5", "metadata": {}, "source": [ "And the SHA-1 matches the CDX record" ] }, { "cell_type": "code", "execution_count": 7, "id": "aedf83a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sha1_digest(original_content) == record['digest']" ] }, { "cell_type": "markdown", "id": "be0d69bc", "metadata": {}, "source": [ "Here's a capture of the website I made on 2021-12-01" ] }, { "cell_type": "code", "execution_count": 8, "id": "df5f5a6c", "metadata": {}, "outputs": [], "source": [ "dec21_content = requests.get('https://raw.githubusercontent.com/EdwardJRoss/skeptric/98419583bc0c7b71ab9469250bbed924cdac448d/static/resources/about.html').content" ] }, { "cell_type": "markdown", "id": "61cf49f4", "metadata": {}, "source": [ "And it's byte-for-byte the same as the snapshot" ] }, { "cell_type": "code", "execution_count": 9, "id": "b140e8a7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dec21_content == original_content" 
] }, { "cell_type": "code", "execution_count": 10, "id": "b5f217cd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sha1_digest(dec21_content)" ] }, { "cell_type": "markdown", "id": "d5327e5a", "metadata": {}, "source": [ "# Removing Headers" ] }, { "cell_type": "markdown", "id": "110f5f38", "metadata": {}, "source": [ "The Wayback Machine version injects a header just after the `<head>` tag down to `<!-- End Wayback Rewrite JS Include -->`.\n", "\n", "It looks like a bit of javascript and some CSS (likely for tracking and adding banners, search, etc)" ] }, { "cell_type": "code", "execution_count": 11, "id": "01ff19ca", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", " \n", " \n", " About Skeptric · \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "\n", "\n" ] } ], "source": [ "print(wayback_content[-1000:].decode('utf-8'))" ] }, { "cell_type": "code", "execution_count": 16, "id": "f3249757", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ard-ross-4909ba13a/\" target=\"_blank\" rel=\"noopener\">LinkedIn\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "\n" ] } ], "source": [ "print(original_content[-500:].decode('utf-8'))" ] }, { "cell_type": "markdown", "id": "8d718dd2", "metadata": {}, "source": [ "We can roughly remove this by looking for the FILE ARCHIVED ON:" ] }, { "cell_type": "code", "execution_count": 17, "id": "8923bc33", "metadata": {}, "outputs": [], "source": [ "def remove_wayback_footer(content):\n", " _prefix = b'\\n'\n", " _start = _prefix + b'