@@ -73,15 +73,26 @@ public function collectData()
7373
7474 $ jsonBlobs = $ html ->find ('script[type="application/json"] ' );
7575
76- $ gatheredCodes = [];
76+ $ gatheredPosts = [];
7777 $ limit = $ this ->getInput ('limit ' );
7878 foreach ($ jsonBlobs as $ jsonBlob ) {
79- // The structure of the JSON document is likely to change, but we're looking for a "code" inside a "post"
80- foreach ($ this ->recursiveFind ($ this ->recursiveFind (json_decode ($ jsonBlob ->innertext ), 'post ' ), 'code ' ) as $ candidateCode ) {
79+ // The structure of the JSON document is likely to change, but we're looking for "post" objects
80+ foreach ($ this ->recursiveFind (json_decode ($ jsonBlob ->innertext ), 'post ' ) as $ post ) {
81+ if (!is_object ($ post ) && !is_array ($ post )) {
82+ continue ;
83+ }
84+ $ post = (array )$ post ;
85+ if (!isset ($ post ['code ' ])) {
86+ continue ;
87+ }
88+ $ candidateCode = $ post ['code ' ];
8189 // code should be like CzZk4-USq1O or Cy3m1VnRiwP or Cywjyrdv9T6 or CzZk4-USq1O
82- if (grapheme_strlen ($ candidateCode ) == 11 and !in_array ($ candidateCode , $ gatheredCodes )) {
83- $ gatheredCodes [] = $ candidateCode ;
84- if (count ($ gatheredCodes ) >= $ limit ) {
90+ if (grapheme_strlen ($ candidateCode ) == 11 and !isset ($ gatheredPosts [$ candidateCode ])) {
91+ $ gatheredPosts [$ candidateCode ] = [
92+ 'code ' => $ candidateCode ,
93+ 'taken_at ' => $ post ['taken_at ' ] ?? null ,
94+ ];
95+ if (count ($ gatheredPosts ) >= $ limit ) {
8596 break 2 ;
8697 }
8798 }
@@ -91,10 +102,10 @@ public function collectData()
91102 $ this ->feedName = html_entity_decode ($ html ->find ('meta[property=og:title] ' , 0 )->content );
92103 // todo: meta[property=og:description] could populate the feed description
93104
94- foreach ($ gatheredCodes as $ postCode ) {
105+ foreach ($ gatheredPosts as $ postData ) {
95106 $ item = [];
96107 // post URL is like: https://www.threads.net/@zuck/post/Czrr520PZfh
97- $ item ['uri ' ] = $ this ->getURI () . '/post/ ' . $ postCode ;
108+ $ item ['uri ' ] = $ this ->getURI () . '/post/ ' . $ postData [ ' code ' ] ;
98109 $ articleHtml = getSimpleHTMLDOMCached ($ item ['uri ' ], 15778800 ); // cache time: six months
99110
100111 // Relying on meta tags ought to be more reliable.
@@ -111,7 +122,11 @@ public function collectData()
111122 }
112123
113124 // todo: parse hashtags out of content for $item['categories']
114- // todo: try to scrape out a timestamp for $item['timestamp'], it's not in the meta tags
125+
126+ // Extract timestamp from profile JSON data (taken_at is a Unix timestamp)
127+ if (isset ($ postData ['taken_at ' ]) && $ postData ['taken_at ' ]) {
128+ $ item ['timestamp ' ] = $ postData ['taken_at ' ];
129+ }
115130
116131 $ this ->items [] = $ item ;
117132 }
0 commit comments