In reply to Ana.
An actual salad on the side, as pictured, makes it “salad-y.”
AI version:
python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def _absolutize_srcset(srcset, base_url):
    """Return *srcset* with every candidate URL resolved against *base_url*.

    Candidates are split on commas, so a URL that itself contains a comma
    (for example a ``data:`` URI) is not handled specially.
    """
    candidates = []
    for candidate in srcset.split(','):
        # split() with no argument collapses runs of whitespace and yields
        # an empty list for blank candidates (e.g. after a trailing comma).
        tokens = candidate.split()
        if not tokens:
            continue
        url_part, *descriptors = tokens
        # Joining the pieces avoids a trailing space when there is no
        # width/density descriptor.
        candidates.append(' '.join([urljoin(base_url, url_part), *descriptors]))
    return ', '.join(candidates)


def _rewrite_image(img_tag, base_url):
    """Make one ``<img>`` tag absolute and strip its lazy-loading markers."""
    src = img_tag.get('src')
    lazy_src = img_tag.get('data-lazy-src')
    srcset = img_tag.get('srcset')
    lazy_srcset = img_tag.get('data-lazy-srcset')

    # Prefer an existing src; otherwise promote data-lazy-src into src.
    promoted_lazy_src = False
    if src:
        img_tag['src'] = urljoin(base_url, src)
    elif lazy_src:
        img_tag['src'] = urljoin(base_url, lazy_src)
        del img_tag['data-lazy-src']
        promoted_lazy_src = True

    if srcset:
        img_tag['srcset'] = _absolutize_srcset(srcset, base_url)

    if promoted_lazy_src:
        # The lazy source was moved into src, so drop its lazy srcset too.
        if 'data-lazy-srcset' in img_tag.attrs:
            del img_tag['data-lazy-srcset']
    elif lazy_srcset and not srcset:
        img_tag['data-lazy-srcset'] = _absolutize_srcset(lazy_srcset, base_url)

    # Remove lazy-loading classes that might interfere when the HTML is
    # rendered statically.
    lazy_classes = ('jetpack-lazy-image', 'jetpack-lazy-image--handled')
    classes = img_tag.get('class')
    if isinstance(classes, list):
        img_tag['class'] = [name for name in classes if name not in lazy_classes]

    # Ask for immediate display instead of deferred loading.
    if img_tag.get('loading') == 'lazy':
        img_tag['loading'] = 'eager'


def get_full_article_html(url, timeout=10):
    """Fetch an article and return its HTML with absolute image/link URLs.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The article element serialized as an HTML string. On failure a
        human-readable error string is returned instead of raising: one
        starting with "Error fetching URL:" for request errors, or a
        "Could not find the main article content" message when neither
        lookup matches.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {e}"

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main article content: first the div with class
    # "entry-content", then fall back to the broader <article> tag.
    article_content = soup.find('div', class_='entry-content')
    if article_content is None:
        article_content = soup.find('article')
    if article_content is None:
        return "Could not find the main article content (div.entry-content or article tag)."

    # Resolve relative paths against the URL that was actually served (after
    # any redirects), with the fragment identifier removed.
    base_url = (response.url or url).split('#')[0]

    for img_tag in article_content.find_all('img'):
        _rewrite_image(img_tag, base_url)

    # Rewrite link href attributes to be absolute.
    for a_tag in article_content.find_all('a'):
        href = a_tag.get('href')
        if href:
            a_tag['href'] = urljoin(base_url, href)

    return str(article_content)
# The specific URL provided points to a comment, but the request implies the
# full article. Smitten Kitchen article URLs are typically in the format:
#     https://smittenkitchen.com/YYYY/MM/article-slug/
# So, for the hash brown patties, the base URL is:
article_url = "https://smittenkitchen.com/2023/04/hash-brown-patties/"

# Fetch the article and print either the rewritten HTML or the error string
# that get_full_article_html returns on failure.
final_article_html = get_full_article_html(article_url)
print(final_article_html)
