Query Internet Archive CDX

import requests
# Query the Internet Archive CDX search endpoint for captures of
# skeptric.com/about/, asking for a JSON response.
# NOTE(review): 'to' presumably caps the capture timestamp at 2021-11 --
# confirm against the CDX server documentation.
r = requests.get('http://web.archive.org/cdx/search/cdx', 
                 params={'url': 'skeptric.com/about/', 'output': 'json', 'to': '202111'})

captures = r.json()

import pandas as pd
df = pd.DataFrame(captures[1:], columns=captures[0])
df
urlkey timestamp original mimetype statuscode digest length
0 com,skeptric)/about 20211120235913 https://skeptric.com/about/ text/html 200 Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7 3266
record = df.iloc[0]
record
urlkey                     com,skeptric)/about
timestamp                       20211120235913
original           https://skeptric.com/about/
mimetype                             text/html
statuscode                                 200
digest        Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7
length                                    3266
Name: 0, dtype: object

Fetch Original and Archived Content

We can get the version available from the Wayback Machine like this:

# Build the replay URL from the capture's timestamp and original URL, then
# download the page as raw bytes. This is the version the Wayback Machine
# serves, which differs from the captured HTML (its digest does not match
# the CDX record).
wayback_url = f'http://web.archive.org/web/{record.timestamp}/{record.original}'
wayback_content = requests.get(wayback_url).content

The digests don’t match because the Internet Archive changes the HTML

from hashlib import sha1
from base64 import b32encode

def sha1_digest(content: bytes) -> str:
    """Return the SHA-1 hash of ``content`` as a Base32 ASCII string.

    This is the same encoding as the ``digest`` column of the CDX record,
    so the result can be compared against it directly.
    """
    raw_hash = sha1(content).digest()
    encoded = b32encode(raw_hash)
    return str(encoded, 'ascii')
sha1_digest(wayback_content)
'OXZ5C2VPDFFRV6U3CCNM6QT7VKND6SSE'

However we can get the original HTML captured:

# Same replay URL, but with `id_` appended to the timestamp: this returns
# the HTML as originally captured (its SHA-1 matches the CDX digest).
original_url = f'http://web.archive.org/web/{record.timestamp}id_/{record.original}'
original_content = requests.get(original_url).content

And the SHA-1 matches the CDX record

sha1_digest(original_content) == record['digest']
True

Here’s a capture of the website I made on 2021-12-01

dec21_content = requests.get('https://raw.githubusercontent.com/EdwardJRoss/skeptric/98419583bc0c7b71ab9469250bbed924cdac448d/static/resources/about.html').content

And it’s byte-for-byte the same as the snapshot

dec21_content == original_content
True
sha1_digest(dec21_content)
'Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7'

Removing Headers

The Wayback Machine version injects a header just after the <head> tag, down to <!-- End Wayback Rewrite JS Include -->.

It looks like a bit of javascript and some CSS (likely for tracking and adding banners, search, etc)

print(wayback_content[:1500].decode('utf-8'))
<!DOCTYPE html>
<html lang="en-us">
<head><script src="//archive.org/includes/analytics.js?v=cf34f82" type="text/javascript"></script>
<script type="text/javascript">window.addEventListener('DOMContentLoaded',function(){var v=archive_analytics.values;v.service='wb';v.server_name='wwwb-app216.us.archive.org';v.server_ms=347;archive_analytics.send_pageview({});});</script>
<script type="text/javascript" src="/_static/js/bundle-playback.js?v=UfTkgsKx" charset="utf-8"></script>
<script type="text/javascript" src="/_static/js/wombat.js?v=UHAOicsW" charset="utf-8"></script>
<script type="text/javascript">
  __wm.init("http://web.archive.org/web");
  __wm.wombat("https://skeptric.com/about/","20211120235913","http://web.archive.org/","web","/_static/",
          "1637452753");
</script>
<link rel="stylesheet" type="text/css" href="/_static/css/banner-styles.css?v=omkqRugM" />
<link rel="stylesheet" type="text/css" href="/_static/css/iconochive.css?v=qtvMKcIJ" />
<!-- End Wayback Rewrite JS Include -->

    <meta charset="utf-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>

    
    
    <title>About Skeptric · </title>

    <meta name="HandheldFriendly" content="True"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>

    
    <link rel="stylesheet" href="http://web.archive.org/web/20211120235913cs_/https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css"/>
<script async src="http://web.archiv

It’s missing from the original

print(original_content[:500].decode('utf-8'))
<!DOCTYPE html>
<html lang="en-us">
<head>
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />

    
    
    <title>About Skeptric · </title>

    <meta name="HandheldFriendly" content="True" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    
    <link rel="stylesheet" href="https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css" />
<script async src="https://www.googletagm
def remove_wayback_header(content: bytes) -> bytes:
    """Strip the block the Wayback Machine injects just after ``<head>``.

    The injected block runs from the archive.org analytics ``<script>`` tag
    through the ``<!-- End Wayback Rewrite JS Include -->`` comment and the
    newline that follows it.

    Args:
        content: Raw bytes of a page as served by the Wayback Machine.

    Returns:
        ``content`` with the injected block cut out; everything before and
        after it is left untouched.

    Raises:
        ValueError: If the start marker is missing, or if no end marker
            follows the start marker.
    """
    _start = b'<script src="//archive.org/includes/analytics.js'
    _end = b'<!-- End Wayback Rewrite JS Include -->\n'
    start_idx = content.find(_start)
    if start_idx < 0:
        raise ValueError("Could not find Wayback header start marker")
    # Only look for the end marker *after* the start marker; an earlier
    # occurrence would otherwise make the splice duplicate content.
    end_idx = content.find(_end, start_idx + len(_start))
    if end_idx < 0:
        raise ValueError("Could not find Wayback header end marker")
    return content[:start_idx] + content[end_idx + len(_end):]

After removing the header the start looks the same (except for the URL rewriting at the end)

print(remove_wayback_header(wayback_content)[:500].decode('utf-8'))
<!DOCTYPE html>
<html lang="en-us">
<head>
    <meta charset="utf-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>

    
    
    <title>About Skeptric · </title>

    <meta name="HandheldFriendly" content="True"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>

    
    <link rel="stylesheet" href="http://web.archive.org/web/20211120235913cs_/https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css"/>
<

Removing Footers

The Wayback Machine adds a bunch of footers about the capture and the archival and copyright notice.

print(wayback_content[-1000:].decode('utf-8'))
="MathJax-script" async src="http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>


<script src="http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
<script>mermaid.initialize({ startOnLoad: true, securityLevel: 'loose' });</script>

</footer>

    </div>

</body>
</html>
<!--
     FILE ARCHIVED ON 23:59:13 Nov 20, 2021 AND RETRIEVED FROM THE
     INTERNET ARCHIVE ON 10:20:48 Dec 01, 2021.
     JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE.

     ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C.
     SECTION 108(a)(3)).
-->
<!--
playback timings (ms):
  captures_list: 204.223
  exclusion.robots: 0.095
  exclusion.robots.policy: 0.087
  RedisCDXSource: 21.732
  esindex: 0.008
  LoadShardBlock: 161.202 (3)
  PetaboxLoader3.datanode: 170.534 (4)
  CDXLines.iter: 18.668 (3)
  load_resource: 137.66
  PetaboxLoader3.resolve: 57.198
-->
print(original_content[-500:].decode('utf-8'))
ard-ross-4909ba13a/" target="_blank" rel="noopener">LinkedIn</a>
                </nav>
            </div>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>


<script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
<script>mermaid.initialize({ startOnLoad: true, securityLevel: 'loose' });</script>

</footer>

    </div>

</body>
</html>

We can roughly remove this by looking for the FILE ARCHIVED ON:

def remove_wayback_footer(content: bytes) -> bytes:
    """Strip the comment footer the Wayback Machine appends after ``</html>``.

    The footer is located by the ``FILE ARCHIVED ON`` comment that directly
    follows the closing ``</html>`` line. Everything from that comment to
    the end of the document is dropped; the ``</html>\\n`` itself is kept.

    Args:
        content: Raw bytes of a page as served by the Wayback Machine.

    Returns:
        ``content`` truncated just after the closing ``</html>`` line.

    Raises:
        ValueError: If the footer marker is not present.
    """
    _prefix = b'</html>\n'
    _start = _prefix + b'<!--\n     FILE ARCHIVED ON '
    # The footer is appended at the very end, so take the LAST occurrence:
    # a page that happens to contain the marker text in its own body must
    # not be truncated early.
    start_idx = content.rfind(_start)
    if start_idx < 0:
        raise ValueError("Could not find Wayback footer marker")
    return content[:start_idx + len(_prefix)]
print(remove_wayback_footer(wayback_content)[-500:].decode('utf-8'))
ive.org/web/20211120235913js_/https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>


<script src="http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
<script>mermaid.initialize({ startOnLoad: true, securityLevel: 'loose' });</script>

</footer>

    </div>

</body>
</html>

And the rest

We can put all our changes together

def remove_wayback_changes(content, timestamp):
    """Undo the Wayback Machine's modifications to a served page.

    Chains three clean-up steps in order: header removal, footer removal,
    and link clean-up via ``remove_wayback_links`` (defined elsewhere; it
    receives the capture ``timestamp``).

    Args:
        content: Page content as served by the Wayback Machine.
        timestamp: Capture timestamp, forwarded to ``remove_wayback_links``.

    Returns:
        The content after all three steps have been applied.
    """
    without_header = remove_wayback_header(content)
    without_footer = remove_wayback_footer(without_header)
    return remove_wayback_links(without_footer, timestamp)
clean_wayback_content = remove_wayback_changes(wayback_content, record['timestamp'])
clean_wayback_content == original_content
False
from difflib import SequenceMatcher
# Compare the original capture (a) with the cleaned Wayback content (b),
# with the automatic junk heuristic switched off.
seqmatcher = SequenceMatcher(None, original_content, clean_wayback_content, autojunk=False)

# Number of bytes of surrounding context shown on each side of a difference.
context_before = context_after = 20

# Print every non-matching region as a pair: the original slice first, then
# the cleaned Wayback slice, followed by a blank line.
for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
    if tag != 'equal':
        # Clamp the context windows to the bounds of each sequence.
        a_min = max(0, a0 - context_before)
        a_max = min(len(seqmatcher.a), a1 + context_after)
        b_min = max(0, b0 - context_before)
        b_max = min(len(seqmatcher.b), b1 + context_after)

        print(seqmatcher.a[a_min:a_max])
        print(seqmatcher.b[b_min:b_max])
        print()
b'meta charset="utf-8" />\n    <meta http-eq'
b'meta charset="utf-8"/>\n    <meta http-eq'

b'e" content="IE=edge" />\n\n    \n    \n    <t'
b'e" content="IE=edge"/>\n\n    \n    \n    <t'

b'ndly" content="True" />\n    <meta name="v'
b'ndly" content="True"/>\n    <meta name="v'

b', initial-scale=1.0" />\n\n    \n    <link r'
b', initial-scale=1.0"/>\n\n    \n    <link r'

b'015bf2d95d914e5.css" />\n<script async src'
b'015bf2d95d914e5.css"/>\n<script async src'

b'"menuitem"><a href="/about/">About</a></'
b'"menuitem"><a href="https://skeptric.com/about/">About</a></'

b'"menuitem"><a href="/">Home</a></li>\n   '
b'"menuitem"><a href="https://skeptric.com/">Home</a></li>\n   '

b'https://skeptric.com">skeptric.com</a>.<'
b'https://skeptric.com/">skeptric.com</a>.<'

We can mangle the original HTML to get the same result. I doubt this would be general enough to work on other pages, but gives a flavour of the changes.

import re
def wayback_normalise_content(content: bytes, base_url: str) -> bytes:
    """Apply to original HTML the rewrites seen in cleaned Wayback content.

    Three transformations are applied, in order:

    1. Spaces before a self-closing ``/>`` are removed.
    2. ``href`` values starting with ``/`` are prefixed with ``base_url``.
    3. An ``href`` that is exactly ``base_url`` gains a trailing ``/``.

    Args:
        content: Raw bytes of the originally captured HTML.
        base_url: Site root without a trailing slash, e.g.
            ``'https://skeptric.com'``. Must be ASCII.

    Returns:
        The rewritten content.

    Raises:
        UnicodeEncodeError: If ``base_url`` contains non-ASCII characters.
    """
    url = base_url.encode('ascii')
    content = re.sub(rb' */>', b'/>', content)
    content = content.replace(b'href="/', b'href="' + url + b'/')
    # Literal replacement rather than a regex: the URL contains '.', which
    # as a pattern would match any byte, and backslashes in a regex
    # replacement would be treated as escapes.
    content = content.replace(b'href="' + url + b'"', b'href="' + url + b'/"')
    return content
assert wayback_normalise_content(original_content, 'https://skeptric.com') == clean_wayback_content