import requests
from bs4 import BeautifulSoup
import json


def scrape_forum(url):
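    """Scrape author, date and body of every post on one forum thread page.

    Returns a list of dicts with the keys: author, date, date_iso, content, tags.
    """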
    # Bound the request time and raise on HTTP errors instead of parsing an error page
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # "postleft" blocks hold the author/date metadata; "postmsg" blocks hold the message bodies
    postlefts = soup.find_all("div", class_="postleft")
    postmsgs = soup.find_all("div", class_="postmsg")

    results = []

    # Map French month names to two-digit month numbers for building ISO dates
    mois_map = {
        "janvier": "01",
        "février": "02",
        "mars": "03",
        "avril": "04",
        "mai": "05",
        "juin": "06",
        "juillet": "07",
        "août": "08",
        "septembre": "09",
        "octobre": "10",
        "novembre": "11",
        "décembre": "12"
    }

    # Metadata and message blocks appear in the same order, so zip pairs them post by post
    for left, msg in zip(postlefts, postmsgs):
        # The author is the first <strong> tag in the metadata block
        auteur_tag = left.find("strong")
        auteur = auteur_tag.get_text(strip=True) if auteur_tag else "Unknown"

        # The post date is taken from the last <dt> tag of the metadata block
        dt_tags = left.find_all("dt")
        date_str = dt_tags[-1].get_text(strip=True) if dt_tags else "Unknown date"


        # Dates look like "<day> <French month> <year> ..."; convert them to ISO YYYY-MM-DD
        try:
            parts = date_str.split()

            jour = parts[0]
            mois = mois_map.get(parts[1].lower(), "01")
            annee = parts[2]

            date_iso = f"{annee}-{mois}-{jour.zfill(2)}"
        except IndexError:
            # Unexpected or missing date: leave the ISO field empty
            date_iso = ""


        # Keep line breaks between the paragraphs of the post body
        texte = msg.get_text(separator="\n", strip=True)

        results.append({
            "author": auteur,
            "date": date_str,
            "date_iso": date_iso,
            "content": texte,
            "tags": ["alcool","nalméfène","nalmefene"]
        })

    return results


def append_to_html(posts, output_file="scrappharma.html", source_url=""):
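    """Append scraped posts to an HTML archive, creating a minimal page if the file does not exist yet.

    source_url is accepted by the caller but is not currently written into the page.
    """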
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
    except FileNotFoundError:
        soup = BeautifulSoup("""
        <html>
        <head>
            <meta charset="UTF-8">
            <title>Scrap Pharma</title>
            <style>
                .post {
                    border: 1px solid #ccc;
                    padding: 10px;
                    margin-bottom: 15px;
                    max-width: 800px;
                }
                pre {
                    white-space: pre-wrap;
                    word-wrap: break-word;
                }
            </style>
        </head>
        <body>
            <h1>Archive des posts</h1>
            <div id="posts-container"></div>
        </body>
        </html>
        """, "html.parser")

    # Reuse the existing posts container, or create one if the page lacks it
    container = soup.find("div", id="posts-container")

    if container is None:
        container = soup.new_tag("div", id="posts-container")
        soup.body.append(container)

    # Render each post as author, date, then the raw text inside a <pre> block
    for post in posts:
        post_div = soup.new_tag("div", **{"class": "post"})

        author = soup.new_tag("div", **{"class": "pseudo"})
        author.string = f" {post['author']}"

        date = soup.new_tag("div", **{"class": "date"})
        date.string = f" {post['date']}"

        content = soup.new_tag("pre")
        content.string = post["content"]

        post_div.append(author)
        post_div.append(date)
        post_div.append(content)

        container.append(post_div)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(soup.prettify())


def export_to_json(posts, output_file="scrappharma.json"):
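    """Append scraped posts to a JSON archive, merging them with any previously saved entries."""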
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            existing_posts = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        existing_posts = []

    # Append the new posts after the ones already archived (re-running the script may create duplicates)
    all_posts = existing_posts + posts

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_posts, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    url = "https://www.psychoactif.org/forum/t16038-p1-Selincro-avez-vous-reussi-baisser-vos-consommations-alcool.html"

    posts = scrape_forum(url)

    append_to_html(posts, source_url=url)
    export_to_json(posts)

    print(f"{len(posts)} posts ajoutés dans scrappharma.html et scrappharma.json")