Skip to content

Commit 0be81c2

Browse files
Copilot and soxoj authored
Fix broken Facebook profile parsing (#217)
* Initial plan

* Fix Facebook parsing: use meta tags + facebookexternalhit UA

Facebook changed their page structure: the old __bbox JSON regex with "complete" and "sequence_number" no longer exists, and the flag <title>Facebook</title> doesn't match public profiles (which have user-specific titles). Additionally, the default Chrome User-Agent causes Facebook to redirect to login.

Fix by:
- Switching to BeautifulSoup meta tag extraction (og:title, og:url, og:image, og:description, al:android:url for uid)
- Updating flags to match public profile pages
- Adding url_mutations with facebookexternalhit User-Agent so Facebook serves actual page content

Agent-Logs-Url: https://github.com/soxoj/socid-extractor/sessions/e66dfc76-d822-40b9-ad17-6af6d2e4e19b

Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>
1 parent 7d7f654 commit 0be81c2

3 files changed

Lines changed: 66 additions & 23 deletions

File tree

socid_extractor/schemes.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -67,22 +67,22 @@
6767
},
6868
'Facebook user profile': {
6969
'url_hints': ('facebook.com', 'fb.com', 'm.facebook.com'),
70-
'flags': ['<html id="facebook"', '<title>Facebook</title>'],
71-
'regex': r'({"__bbox":{"complete".+"sequence_number":0}})',
72-
'extract_json': True,
73-
'transforms': [
74-
json.loads,
75-
lambda x: x['result']['data']['user'],
76-
json.dumps,
77-
],
70+
'flags': ['<html id="facebook"', 'property="og:title"'],
71+
'bs': True,
7872
'fields': {
79-
'uid': lambda x: x.get('id'),
80-
'username': lambda x: x.get('url').split('/')[-1],
81-
'fullname': lambda x: x.get('name'),
82-
'is_verified': lambda x: x.get('is_verified'),
83-
'image': lambda x: x.get('profile_picture_for_sticky_bar', {}).get('uri', ''),
84-
'image_bg': lambda x: x.get('cover_photo', {}).get('photo', {}).get('image', {}).get('uri', ''),
85-
}
73+
'uid': lambda x: x.find('meta', {'property': 'al:android:url'})['content'].replace('fb://profile/', ''),
74+
'username': lambda x: x.find('meta', {'property': 'og:url'})['content'].strip('/').split('/')[-1],
75+
'fullname': lambda x: x.find('meta', {'property': 'og:title'})['content'],
76+
'description': lambda x: x.find('meta', {'property': 'og:description'})['content'],
77+
'image': lambda x: x.find('meta', {'property': 'og:image'})['content'],
78+
},
79+
'url_mutations': [
80+
{
81+
'from': r'https?://(?:[\w-]+\.)?(?:facebook\.com|fb\.com)/(?P<username>[^/?#]+)',
82+
'to': 'https://www.facebook.com/{username}',
83+
'headers': {'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'},
84+
},
85+
],
8686
},
8787
'Facebook group': {
8888
'url_hints': ('facebook.com', 'fb.com'),

tests/test_e2e.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -218,17 +218,16 @@ def test_reddit(): # Broken. Site move onto new version. Finding the right cook
218218
assert int(info.get('post_karma')) > int(7000)
219219

220220

221-
@pytest.mark.skip(reason="needs deeper rework")
221+
@pytest.mark.skip(reason="requires facebookexternalhit UA; use url_mutations via CLI")
222222
@pytest.mark.github_failed
223-
def test_facebook_user_profile(): # Broken. Needs deeper rework
224-
info = extract(parse('https://ru-ru.facebook.com/anatolijsharij/')[0])
223+
def test_facebook_user_profile():
224+
info = extract(parse('https://www.facebook.com/zuck/',
225+
headers={'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'})[0])
225226

226-
assert info.get('uid') == '1486042157'
227-
assert info.get('username') == 'anatolijsharij'
228-
assert info.get('fullname') == 'Анатолий Шарий'
229-
assert info.get('is_verified') == 'True'
227+
assert info.get('uid') == '4'
228+
assert info.get('username') == 'zuck'
229+
assert info.get('fullname') == 'Mark Zuckerberg'
230230
assert 'image' in info
231-
assert 'image_bg' in info
232231
assert 'all' not in info
233232

234233

tests/test_socid_improvements.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1245,3 +1245,47 @@ def test_roblox_html_profile():
12451245
assert info.get('username') == 'john'
12461246
assert info.get('uid') == '2191'
12471247
assert 'rbxcdn.com' in info.get('image', '')
1248+
1249+
1250+
def test_facebook_user_profile_meta_tags():
1251+
"""
1252+
Verifies the **Facebook user profile** scheme extracts data from OG and app-link
1253+
meta tags (the format served to crawlers by Facebook).
1254+
1255+
**Check:** `uid`, `username`, `fullname`, `description`, and `image` are extracted
1256+
from the meta tags in the HTML fixture.
1257+
"""
1258+
html = (
1259+
'<!DOCTYPE html>'
1260+
'<html id="facebook" class="_9dls" lang="en" dir="ltr"><head>'
1261+
'<title>Mark Zuckerberg</title>'
1262+
'<meta property="al:android:app_name" content="Facebook" />'
1263+
'<meta property="al:android:url" content="fb://profile/4" />'
1264+
'<meta property="og:title" content="Mark Zuckerberg" />'
1265+
'<meta property="og:description" content="Mark Zuckerberg. 121,000,000 likes" />'
1266+
'<meta property="og:url" content="https://www.facebook.com/zuck/" />'
1267+
'<meta property="og:image" content="https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=4" />'
1268+
'</head><body></body></html>'
1269+
)
1270+
info = extract(html)
1271+
assert info.get('uid') == '4'
1272+
assert info.get('username') == 'zuck'
1273+
assert info.get('fullname') == 'Mark Zuckerberg'
1274+
assert 'likes' in info.get('description', '')
1275+
assert 'lookaside' in info.get('image', '')
1276+
1277+
1278+
def test_facebook_user_profile_no_match_without_og_title():
1279+
"""
1280+
Verifies the **Facebook user profile** scheme does NOT match pages that lack
1281+
``og:title`` meta tag (e.g. login/error pages).
1282+
"""
1283+
html = (
1284+
'<!DOCTYPE html>'
1285+
'<html id="facebook" lang="en"><head>'
1286+
'<title>Error</title>'
1287+
'</head><body><h1>Sorry, something went wrong.</h1></body></html>'
1288+
)
1289+
info = extract(html)
1290+
assert not info.get('uid')
1291+
assert not info.get('fullname')

0 commit comments

Comments
 (0)