Incidentally, if you're on a platform where you can reasonably install cohost.py and you can responsibly vet an internet rando's python3 script before running it on your machine, you can use this to dump the text of all posts from a specific Cohost page of yours to individual files in the current working directory, named in the pattern postID_whenposted.md
I have made Choices in order to preserve some out-of-band information: post headline, CWs, and tags. I make no attempt to retrieve anything except text; archiving posted images is outside my use case.
#!/usr/bin/python3
from pathlib import Path
from cohost.models.user import User
from cohost.models.post import Post
from sys import exit
cookie = ''  # copy-paste from web browser devtools as per https://github.com/valknight/Cohost.py#retrieving-your-cookie
projectName = 'caffeinatedOtter'  # or whatever your @pagename is


def build_markdown(post):
    """Return the markdown text to archive for *post*.

    The post body is used as-is. Out-of-band information is preserved by
    prepending, innermost first: content warnings and tags (each as an HTML
    comment, one entry per line) and finally the headline as a level-1
    heading. Returns an empty string when the post has none of these.
    """
    md = post.plainTextBody or ''
    if post.contentWarnings:
        cwList = '\n'.join(post.contentWarnings)
        md = f"<!-- CWs:\n{cwList}\n-->\n{md}"
    if post.tags:
        tagList = '\n'.join(post.tags)
        md = f"<!-- tags:\n{tagList}\n-->\n{md}"
    if post.headline:
        md = f"# {post.headline} <!-- Cohost headline -->\n\n{md}"
    return md


def post_filename(post):
    """Return the output file name for *post*: ``postID_whenposted.md``.

    The ':' characters of the timestamp are replaced with '-' because ':'
    is not a valid file name character on Windows.
    """
    return f"{post.postId}_{post.publishedAt.replace(':', '-')}.md"


def main():
    """Dump the text of every post of ``projectName`` into the current directory."""
    try:
        user = User.loginWithCookie(cookie)
        project = user.getProject(projectName)
    except Exception:
        # Exception (not a bare except) so Ctrl-C is not reported as a login failure.
        exit('Cohost login failed!')

    here = Path.cwd()
    page = 0
    while True:
        postlist = project.getPosts(page)
        if not postlist:
            # an empty page means we have walked past the last post
            print('Done.')
            exit(0)
        page += 1
        for post in postlist:
            md = build_markdown(post)
            if md:  # posts with nothing to archive produce no file
                # explicit UTF-8 so emoji don't trip the platform default codec
                (here / post_filename(post)).write_text(md, encoding="utf-8")


if __name__ == "__main__":
    main()
For people trying this on Windows that are like "Wait, it's complaining! Help!"
There are 2 potential problems:
-
The filename created by this doesn't work on Windows, because post.publishedAt contains the invalid (for Windows) character : in the times.
How do we fix this?
On line 35
filename = here / f"{post.postId}_{post.publishedAt}.md"
becomes
filename = here / f"{post.postId}_{post.publishedAt.replace(':','-')}.md"
-
You might have, like me, pasted emoji straight into your text. The script doesn't like that either, because you'll get a
UnicodeEncodeError: 'charmap' codec can't encode character '\Usomethingorother' in position somewhere: character maps to <undefined> error.
Have no fear, however, the fix is simple enough:
On line 36
filename.write_text(md)
becomes
filename.write_text(md, encoding="utf-8")
I have discovered there is at least one (1) typo in cohost.py1
(transparentShareOfPostId tries to get transparentShareofPostId)
and also that you can't trust all the data you get back.
Somehow, some of the page URLs for posts are 404?
*confused dog noises*
Edit: Actually, that's not true.
The page is there.
The page exists.
I can go to it.
The server sometimes returns a 404.
*dog noises continue*
I have Pivoted My Code Again (because of course), and think the New Approach will work even better.
🤞
The only thing reasonably left to do is like... comments I guess?
Pictures, maybe?
But... eh.
The final leaf doesn't have a poster because duh, It Me! Always!
yes of course I now also archive images because hello did you see my last chost?
Imagine coming across that in the future without the images.
Doesn't bear thinking about.
The fix is live now.
I should clean up my code and release it, huh?
#!/usr/bin/python3
# Extended Cohost Archiver
# Gets chosts for a page, with full share trees, and image attachments
#
# Original by Caffeinated Otter
# Mauled by MiserablePileOfWords
#
# Post with install and usage instructions here:
# https://cohost.org/caffeinatedOtter/post/5064344-incidentally-if-you
from pathlib import Path
from cohost.models.user import User
from cohost.models.post import Post
from urllib.request import urlretrieve
from urllib.parse import urlparse
from os.path import splitext
from sys import argv, exit
# --- user configuration: edit the two REPLACEME values before running ---
# login cookie, passed to User.loginWithCookie() below.
# copy-paste from web browser devtools as per https://github.com/valknight/Cohost.py#retrieving-your-cookie
cookie = 'REPLACEME'
# your pagename here, with correct capitalisation. It does need to be one of *your* pages
projectName = 'REPLACEME'
# output directory: the working directory at the moment the script starts; all markdown
# files and downloaded images are written here.
# Make sure to run the script from the correct directory, or set your IDE's workdir accordingly.
here = Path.cwd()
# utility function that tries to fix the markdown so it's not a single blob
# utility function that tries to fix the markdown so it's not a single blob
def convert_textblock(input):
    """Return *input* with Markdown hard line breaks added to plain-text lines.

    Every non-blank line outside a fenced code block has its trailing
    whitespace replaced by two spaces plus a newline (a Markdown hard line
    break), so consecutive lines are not merged into one paragraph.
    Lines inside ``` fences, and the fence lines themselves, only have
    trailing whitespace removed. Blank lines are kept as bare newlines,
    both in text (paragraph break) and inside code blocks.

    Args:
        input: the post body as a single string; ``None`` or ``""`` yields ``""``.

    Returns:
        The converted text; every emitted line ends with a newline.
    """
    if not input:
        return ""

    pieces = []
    in_codeblock = False
    for line in input.splitlines():
        if len(line.strip(' \n\t')) == 0:
            # blank line: paragraph break in text, preserved verbatim in code
            pieces.append("\n")
            continue

        trimmed = line.rstrip()
        if line.lstrip().startswith("```"):
            # opening or closing fence: toggle state, never decorate the fence
            in_codeblock = not in_codeblock
            pieces.append(f"{trimmed}\n")
        elif in_codeblock:
            pieces.append(f"{trimmed}\n")  # don't touch code blocks
        else:
            # two trailing spaces are what Markdown needs for a hard break
            pieces.append(f"{trimmed}  \n")
    return "".join(pieces)
def archive_filename(post_id, published_at, was_share):
    """Return the markdown file name for a post.

    Shares get a ``_SHARE`` marker so they can be told apart from original
    posts. ':' in the timestamp is replaced with '-' because ':' is not a
    valid file name character on Windows.
    """
    marker = "_SHARE" if was_share else ""
    return f"{post_id}{marker}_{published_at.replace(':', '-')}.md"


def fetch_image_attachments(blocks, file_prefix):
    """Download the image attachments in *blocks* and return markdown inlining them.

    Args:
        blocks: the list of block dicts of a post (or of a share-tree entry).
        file_prefix: start of each image file name; the image is saved in the
            module-level output directory ``here`` as
            ``{file_prefix}_ATTACHMENT_{n}{extension}``, where ``n`` is the
            1-based position of the block within *blocks*.

    Returns:
        Markdown text with one image reference per downloaded image, or an
        HTML comment for each image that could not be downloaded. Empty
        string when there are no image attachments.
    """
    markdown = ""
    for attachment_count, block in enumerate(blocks, start=1):
        if block.get('type') != "attachment":
            continue
        info_block = block.get('attachment')
        if not info_block or info_block.get('kind') != "image":
            continue

        file_url = info_block['fileURL']
        extension = splitext(urlparse(file_url).path)[1]
        image_file = here / f"{file_prefix}_ATTACHMENT_{attachment_count}{extension}"
        # the alt text is also used as the double-quoted image title, so it
        # must not contain double quotes itself
        alt_text = (info_block.get('altText') or "").replace('"', '\'')

        # try to get our image
        try:
            urlretrieve(file_url, image_file.resolve())
        except Exception as e:
            # best effort: a failed download is recorded, not fatal
            print(e)
            markdown = f"{markdown}\n<!-- failed to get image attachment {attachment_count} from {file_url} -->\n"
        else:
            # note that images are full size; no resizing is attempted
            markdown = f"{markdown}\n<!-- image attachment {attachment_count} from {file_url} -->\n![{alt_text}]({image_file.name} \"{alt_text}\")\n"
    return markdown


def render_share_tree(post):
    """Return markdown for every entry of *post*'s share tree ('' if it has none).

    Entries are separated by a double horizontal rule and carry their poster,
    headline, original URL, CWs, tags, downloaded images and converted body.
    """
    tree_text = ""
    for block_count, shared in enumerate(post.shareTree, start=1):
        # fall back to the position in the tree when the entry has no post id
        og_post_id = shared.get('postId', block_count)
        original_post_url = shared.get('singlePostPageUrl', "[UNKNOWN]")
        poster = (shared.get('postingProject') or {}).get('handle', "")
        share_text = shared.get('plainTextBody') or ""

        headline = shared.get('headline') or ""
        block_headline = f"# {headline}" if headline else ""
        cws = shared.get('cws') or []
        block_cws = '\n'.join(cws) if cws else "(none)"
        tags = shared.get('tags') or []
        block_tags = '\n'.join(tags) if tags else "(none)"

        if block_count > 1:
            tree_text = f"{tree_text}\n____\n____\n"  # add breaks between blocks

        image_attachments = fetch_image_attachments(
            shared.get('blocks') or [],
            f"{post.postId}_SHARED_{og_post_id}",
        )
        tree_text = f"{tree_text}<!-- block {block_count} -->\n<sub>from: {poster}</sub>\n{block_headline}\n\n<!-- original URL: {original_post_url} -->\n<!-- CWs: {block_cws} -->\n<!-- Tags: {block_tags} -->\n\n{image_attachments}\n{convert_textblock(share_text)}\n"
    return tree_text


def archive_post(post):
    """Write one post (with images and share tree) to the output directory.

    Exits the script when the post's file already exists: posts are visited
    in the order the API returns them, and an existing file is taken to mean
    that this and everything after it was archived on a previous run.
    """
    md = convert_textblock(post.plainTextBody)

    # Reading this attribute reportedly raises because of a typo inside
    # cohost.py, so any failure is treated as "not a share".
    try:
        share_of = post.transparentShareOfPostId
    except Exception:
        share_of = None
    was_share = share_of is not None
    if was_share:
        md = f"<!-- Share Of {share_of} -->\n{md}"

    # give blank shares a different name to differentiate them, since they have nothing *we* added, except maybe tags
    filename = here / archive_filename(post.postId, post.publishedAt, was_share)

    # only download newer posts. Remove this whole if block if you want to get everything every time you run the script for some reason
    if filename.exists():
        exit(f"Exiting, existing post found for {post.publishedAt}")

    if post.contentWarnings:
        cw_list = '\n'.join(post.contentWarnings)
        md = f"<!-- CWs:\n{cw_list}\n-->\n{md}"
    if post.tags:
        tag_list = '\n'.join(post.tags)
        md = f"<!-- tags:\n{tag_list}\n-->\n{md}"
    md = f"<!-- URL: {post.url} --> \n{md}"

    # image attachments go above the text, inlined if they could be downloaded
    attachment_text = fetch_image_attachments(post.blocks or [], f"{post.postId}")
    md = f"{attachment_text}\n{md}\n"

    if post.headline:
        md = f"# {post.headline} <!-- Cohost headline -->\n\n{md}"
    else:
        md = f"<!-- No headline -->\n{md}"

    # is this a share of something we *added* something to? Rebuild the entire tree for context
    tree_text = render_share_tree(post) if post.shareTree else ""
    if tree_text:
        # shared posts come before the main post body
        md = f"{tree_text}\n____\n____\n{md}"

    filename.write_text(md, encoding="utf-8")


def main():
    """Log in, then archive every post of ``projectName`` page by page."""
    try:
        user = User.loginWithCookie(cookie)
        project = user.getProject(projectName)
    except Exception as e:
        print(e)
        exit('Cohost login failed!')

    page = 0
    while True:
        try:
            postlist = project.getPosts(page)
        except Exception as e:
            print(e)
            exit(f"Failed on page {page}")
        if not postlist:
            # an empty page means we have walked past the last post
            print('Done.')
            return
        page += 1
        for post in postlist:
            archive_post(post)


if __name__ == "__main__":
    main()
