import requests
r = requests.get('http://web.archive.org/cdx/search/cdx',
                 params={'url': 'skeptric.com/about/', 'output': 'json', 'to': '202111'})
captures = r.json()

import pandas as pd
df = pd.DataFrame(captures[1:], columns=captures[0])
df
                urlkey       timestamp                     original   mimetype statuscode                            digest length
0  com,skeptric)/about  20211120235913  https://skeptric.com/about/  text/html        200  Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7   3266
record = df.iloc[0]
record
urlkey com,skeptric)/about
timestamp 20211120235913
original https://skeptric.com/about/
mimetype text/html
statuscode 200
digest Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7
length 3266
Name: 0, dtype: object
Fetch Original and Archived Content
We can get the version available from the Wayback Machine like this:
wayback_url = f'http://web.archive.org/web/{record.timestamp}/{record.original}'
wayback_content = requests.get(wayback_url).content
The digests don’t match, because the Internet Archive modifies the HTML it serves. The digest in the CDX record is the base32-encoded SHA-1 of the capture, which we can compute:
from hashlib import sha1
from base64 import b32encode

def sha1_digest(content: bytes) -> str:
    return b32encode(sha1(content).digest()).decode('ascii')

sha1_digest(wayback_content)
'OXZ5C2VPDFFRV6U3CCNM6QT7VKND6SSE'
However, we can get the original HTML as captured by adding the id_ flag after the timestamp:
original_url = f'http://web.archive.org/web/{record.timestamp}id_/{record.original}'
original_content = requests.get(original_url).content
And the SHA-1 matches the CDX record:
sha1_digest(original_content) == record['digest']
True
Here’s a capture of the website I made on 2021-12-01:
dec21_content = requests.get('https://raw.githubusercontent.com/EdwardJRoss/skeptric/98419583bc0c7b71ab9469250bbed924cdac448d/static/resources/about.html').content
And it’s byte-for-byte the same as the snapshot:
dec21_content == original_content
True
sha1_digest(dec21_content)
'Z5NRUTRW3XTKZDCJFDKGPJ5BWIBNQCG7'
Restoring Links
By looking through the links we can see that they are prefixed with http://web.archive.org/web/<TIMESTAMP>, with an extra cs_ for CSS and js_ for JavaScript (and im_ for images, not shown here).
import re
re.findall(b'(?:href|src)="([^"]*)"', wayback_content)
[b'//archive.org/includes/analytics.js?v=cf34f82',
b'/_static/js/bundle-playback.js?v=UfTkgsKx',
b'/_static/js/wombat.js?v=UHAOicsW',
b'/_static/css/banner-styles.css?v=omkqRugM',
b'/_static/css/iconochive.css?v=qtvMKcIJ',
b'http://web.archive.org/web/20211120235913cs_/https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css',
b'http://web.archive.org/web/20211120235913js_/https://www.googletagmanager.com/gtag/js?id=UA-167481545-1',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/',
b'/web/20211120235913/https://skeptric.com/about/',
b'/web/20211120235913/https://skeptric.com/',
b'http://web.archive.org/web/20211120235913/https://www.whatcar.xyz/',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/whatcar',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/job-advert-analysis',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/moving-averages-sql/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/searching-100b-pages-cdx/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/common-crawl-index-athena/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/minhash-lsh/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/calculate-centroid-on-sphere/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/constant-models/',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/all_of_statistics_exercises',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/regression_stories',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/regression_stories',
b'http://web.archive.org/web/20211120235913/https://github.com/EdwardJRoss/mlzero',
b'http://web.archive.org/web/20211120235913/mailto:webmaster@skeptric.com',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/reading-list/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/',
b'http://web.archive.org/web/20211120235913/https://skeptric.com/',
b'http://web.archive.org/web/20211120235913/https://twitter.com/EddingtonRoss',
b'http://web.archive.org/web/20211120235913/https://github.com/edwardjross/',
b'http://web.archive.org/web/20211120235913/https://www.linkedin.com/in/edward-ross-4909ba13a/',
b'http://web.archive.org/web/20211120235913js_/https://polyfill.io/v3/polyfill.min.js?features=es6',
b'http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js',
b'http://web.archive.org/web/20211120235913js_/https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js']
We can rewrite them like this:
def remove_wayback_links(content: bytes, timestamp: str) -> bytes:
    # Strip the web.archive.org host and the /web/<timestamp><prefix>/ path segments
    timestamp = timestamp.encode('ascii')
    content = content.replace(b'http://web.archive.org', b'')
    for prefix in [b'', b'im_', b'js_', b'cs_']:
        content = content.replace(b'/web/' + timestamp + prefix + b'/', b'')
    return content

re.findall(b'(?:href|src)="([^"]*)"', remove_wayback_links(wayback_content, record.timestamp))
[b'//archive.org/includes/analytics.js?v=cf34f82',
b'/_static/js/bundle-playback.js?v=UfTkgsKx',
b'/_static/js/wombat.js?v=UHAOicsW',
b'/_static/css/banner-styles.css?v=omkqRugM',
b'/_static/css/iconochive.css?v=qtvMKcIJ',
b'https://skeptric.com/style.main.min.5ea2f07be7e07e221a7112a3095b89d049b96c48b831f16f1015bf2d95d914e5.css',
b'https://www.googletagmanager.com/gtag/js?id=UA-167481545-1',
b'https://skeptric.com/',
b'https://skeptric.com/about/',
b'https://skeptric.com/',
b'https://www.whatcar.xyz/',
b'https://github.com/EdwardJRoss/whatcar',
b'https://github.com/EdwardJRoss/job-advert-analysis',
b'https://skeptric.com/',
b'https://skeptric.com/moving-averages-sql/',
b'https://skeptric.com/searching-100b-pages-cdx/',
b'https://skeptric.com/common-crawl-index-athena/',
b'https://skeptric.com/minhash-lsh/',
b'https://skeptric.com/calculate-centroid-on-sphere/',
b'https://skeptric.com/constant-models/',
b'https://github.com/EdwardJRoss/all_of_statistics_exercises',
b'https://github.com/EdwardJRoss/regression_stories',
b'https://github.com/EdwardJRoss/regression_stories',
b'https://github.com/EdwardJRoss/mlzero',
b'mailto:webmaster@skeptric.com',
b'https://skeptric.com/reading-list/',
b'https://skeptric.com/',
b'https://skeptric.com/',
b'https://twitter.com/EddingtonRoss',
b'https://github.com/edwardjross/',
b'https://www.linkedin.com/in/edward-ross-4909ba13a/',
b'https://polyfill.io/v3/polyfill.min.js?features=es6',
b'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js',
b'https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js']
And the rest
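The remove_wayback_header and remove_wayback_footer functions were defined earlier in this post. As a rough sketch of their shape (the marker comments below are my assumption based on typical Wayback playback pages, and the real functions also strip the playback scripts and stylesheets injected into the head):
def remove_wayback_header(content: bytes) -> bytes:
    # Assumed markers: the injected toolbar sits between these comments
    start = content.find(b'<!-- BEGIN WAYBACK TOOLBAR INSERT -->')
    end = content.find(b'<!-- END WAYBACK TOOLBAR INSERT -->')
    if start != -1 and end != -1:
        end += len(b'<!-- END WAYBACK TOOLBAR INSERT -->')
        content = content[:start] + content[end:]
    return content

def remove_wayback_footer(content: bytes) -> bytes:
    # Assumption: archival metadata comments are appended after the closing html tag
    end = content.find(b'</html>')
    if end != -1:
        content = content[:end + len(b'</html>')]
    return content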
We can put all our changes together:
def remove_wayback_changes(content, timestamp):
    content = remove_wayback_header(content)
    content = remove_wayback_footer(content)
    content = remove_wayback_links(content, timestamp)
    return content

clean_wayback_content = remove_wayback_changes(wayback_content, record['timestamp'])
clean_wayback_content == original_content
False
It’s close, but not identical; difflib shows where the remaining differences are:
from difflib import SequenceMatcher

seqmatcher = SequenceMatcher(isjunk=None,
                             a=original_content,
                             b=clean_wayback_content,
                             autojunk=False)

context_before = context_after = 20

for tag, a0, a1, b0, b1 in seqmatcher.get_opcodes():
    if tag == 'equal':
        continue
    a_min = max(a0 - context_before, 0)
    a_max = min(a1 + context_after, len(seqmatcher.a))
    print(seqmatcher.a[a_min:a_max])
    b_min = max(b0 - context_before, 0)
    b_max = min(b1 + context_after, len(seqmatcher.b))
    print(seqmatcher.b[b_min:b_max])
    print()
b'meta charset="utf-8" />\n <meta http-eq'
b'meta charset="utf-8"/>\n <meta http-eq'
b'e" content="IE=edge" />\n\n \n \n <t'
b'e" content="IE=edge"/>\n\n \n \n <t'
b'ndly" content="True" />\n <meta name="v'
b'ndly" content="True"/>\n <meta name="v'
b', initial-scale=1.0" />\n\n \n <link r'
b', initial-scale=1.0"/>\n\n \n <link r'
b'015bf2d95d914e5.css" />\n<script async src'
b'015bf2d95d914e5.css"/>\n<script async src'
b'"menuitem"><a href="/about/">About</a></'
b'"menuitem"><a href="https://skeptric.com/about/">About</a></'
b'"menuitem"><a href="/">Home</a></li>\n '
b'"menuitem"><a href="https://skeptric.com/">Home</a></li>\n '
b'https://skeptric.com">skeptric.com</a>.<'
b'https://skeptric.com/">skeptric.com</a>.<'
The remaining differences are small: the Wayback Machine drops the space before self-closing tags and makes some relative links absolute. We can mangle the original HTML the same way to get an identical result. I doubt this would be general enough to work on other pages, but it gives a flavour of the changes.
import re

def wayback_normalise_content(content, base_url):
    url = base_url.encode('ascii')
    # Drop the space before self-closing tags: '" />' becomes '"/>'
    content = re.sub(b' */>', b'/>', content)
    # Make root-relative hrefs absolute
    content = content.replace(b'href="/', b'href="' + url + b'/')
    # Add a trailing slash to the bare domain link
    content = re.sub(b'href="' + url + b'"', b'href="' + url + b'/"', content)
    return content

assert wayback_normalise_content(original_content, 'https://skeptric.com') == clean_wayback_content
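Finally we can wrap this into a round trip: given a CDX row, fetch the raw capture and check it against the recorded digest. This is just a minimal sketch reusing the functions above:
def fetch_verified_capture(row) -> bytes:
    # The id_ flag returns the capture exactly as archived
    url = f'http://web.archive.org/web/{row.timestamp}id_/{row.original}'
    content = requests.get(url).content
    # The CDX digest is the base32-encoded SHA-1 of the raw bytes
    assert sha1_digest(content) == row.digest
    return content

content = fetch_verified_capture(record)
If the assert passes we know we have the exact bytes that were crawled.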