Fix broken Facebook profile parsing (#217)

Copilot · soxoj · web-flow · commit 0be81c20143b · 2026-04-02T18:41:17.000+02:00
* Initial plan * Fix Facebook parsing: use meta tags + facebookexternalhit UA. Facebook changed its page structure: the old __bbox JSON blob (matched by a regex on "complete" and "sequence_number") no longer exists, and the <title>Facebook</title> flag doesn't match public profiles (which have user-specific titles). Additionally, the default Chrome User-Agent causes Facebook to redirect to the login page. Fix by: (1) switching to BeautifulSoup meta tag extraction (og:title, og:url, og:image, og:description, and al:android:url for uid); (2) updating flags to match public profile pages; (3) adding url_mutations with the facebookexternalhit User-Agent so Facebook serves the actual page content. Agent-Logs-Url: https://github.com/soxoj/socid-extractor/sessions/e66dfc76-d822-40b9-ad17-6af6d2e4e19b Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>
diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py
@@ -67,22 +67,22 @@
     },
     'Facebook user profile': {
         'url_hints': ('facebook.com', 'fb.com', 'm.facebook.com'),
-        'flags': ['<html id="facebook"', '<title>Facebook</title>'],
-        'regex': r'({"__bbox":{"complete".+"sequence_number":0}})',
-        'extract_json': True,
-        'transforms': [
-            json.loads,
-            lambda x: x['result']['data']['user'],
-            json.dumps,
-        ],
+        'flags': ['<html id="facebook"', 'property="og:title"'],
+        'bs': True,
         'fields': {
-            'uid': lambda x: x.get('id'),
-            'username': lambda x: x.get('url').split('/')[-1],
-            'fullname': lambda x: x.get('name'),
-            'is_verified': lambda x: x.get('is_verified'),
-            'image': lambda x: x.get('profile_picture_for_sticky_bar', {}).get('uri', ''),
-            'image_bg': lambda x: x.get('cover_photo', {}).get('photo', {}).get('image', {}).get('uri', ''),
-        }
+            'uid': lambda x: x.find('meta', {'property': 'al:android:url'})['content'].replace('fb://profile/', ''),
+            'username': lambda x: x.find('meta', {'property': 'og:url'})['content'].strip('/').split('/')[-1],
+            'fullname': lambda x: x.find('meta', {'property': 'og:title'})['content'],
+            'description': lambda x: x.find('meta', {'property': 'og:description'})['content'],
+            'image': lambda x: x.find('meta', {'property': 'og:image'})['content'],
+        },
+        'url_mutations': [
+            {
+                'from': r'https?://(?:[\w-]+\.)?(?:facebook\.com|fb\.com)/(?P<username>[^/?#]+)',
+                'to': 'https://www.facebook.com/{username}',
+                'headers': {'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'},
+            },
+        ],
     },
     'Facebook group': {
         'url_hints': ('facebook.com', 'fb.com'),
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -218,17 +218,16 @@ def test_reddit():  # Broken. Site move onto new version. Finding the right cook
     assert int(info.get('post_karma')) > int(7000)
 
 
-@pytest.mark.skip(reason="needs deeper rework")
+@pytest.mark.skip(reason="requires facebookexternalhit UA; use url_mutations via CLI")
 @pytest.mark.github_failed
-def test_facebook_user_profile():  # Broken. Needs deeper rework
-    info = extract(parse('https://ru-ru.facebook.com/anatolijsharij/')[0])
+def test_facebook_user_profile():
+    info = extract(parse('https://www.facebook.com/zuck/',
+                         headers={'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'})[0])
 
-    assert info.get('uid') == '1486042157'
-    assert info.get('username') == 'anatolijsharij'
-    assert info.get('fullname') == 'Анатолий Шарий'
-    assert info.get('is_verified') == 'True'
+    assert info.get('uid') == '4'
+    assert info.get('username') == 'zuck'
+    assert info.get('fullname') == 'Mark Zuckerberg'
     assert 'image' in info
-    assert 'image_bg' in info
     assert 'all' not in info
 
 
diff --git a/tests/test_socid_improvements.py b/tests/test_socid_improvements.py
@@ -1245,3 +1245,47 @@ def test_roblox_html_profile():
     assert info.get('username') == 'john'
     assert info.get('uid') == '2191'
     assert 'rbxcdn.com' in info.get('image', '')
+
+
+def test_facebook_user_profile_meta_tags():
+    """
+    Verifies the **Facebook user profile** scheme extracts data from OG and app-link
+    meta tags (the format served to crawlers by Facebook).
+
+    **Check:** `uid`, `username`, `fullname`, `description`, and `image` are extracted
+    from the meta tags in the HTML fixture.
+    """
+    html = (
+        '<!DOCTYPE html>'
+        '<html id="facebook" class="_9dls" lang="en" dir="ltr"><head>'
+        '<title>Mark Zuckerberg</title>'
+        '<meta property="al:android:app_name" content="Facebook" />'
+        '<meta property="al:android:url" content="fb://profile/4" />'
+        '<meta property="og:title" content="Mark Zuckerberg" />'
+        '<meta property="og:description" content="Mark Zuckerberg. 121,000,000 likes" />'
+        '<meta property="og:url" content="https://www.facebook.com/zuck/" />'
+        '<meta property="og:image" content="https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=4" />'
+        '</head><body></body></html>'
+    )
+    info = extract(html)
+    assert info.get('uid') == '4'
+    assert info.get('username') == 'zuck'
+    assert info.get('fullname') == 'Mark Zuckerberg'
+    assert 'likes' in info.get('description', '')
+    assert 'lookaside' in info.get('image', '')
+
+
+def test_facebook_user_profile_no_match_without_og_title():
+    """
+    Verifies the **Facebook user profile** scheme does NOT match pages that lack
+    ``og:title`` meta tag (e.g. login/error pages).
+    """
+    html = (
+        '<!DOCTYPE html>'
+        '<html id="facebook" lang="en"><head>'
+        '<title>Error</title>'
+        '</head><body><h1>Sorry, something went wrong.</h1></body></html>'
+    )
+    info = extract(html)
+    assert not info.get('uid')
+    assert not info.get('fullname')