[extractor/GoogleDrive] Add folder extractor (#4009)

Closes #3388
Authored by: evansp, pukkandan
This commit is contained in:
Evan Spensley 2022-06-14 09:33:29 -04:00 committed by GitHub
parent 2cb1982043
commit 145c5a83a8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 61 additions and 2 deletions

View file

@ -1188,7 +1188,7 @@ class InfoExtractor:
self.report_warning('unable to extract %s' % _name + bug_reports_message()) self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None return None
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='.+', fatal=True, **kwargs): def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='(?s:.+)', fatal=True, **kwargs):
"""Searches string for the JSON object specified by start_pattern""" """Searches string for the JSON object specified by start_pattern"""
# NB: end_pattern is only used to reduce the size of the initial match # NB: end_pattern is only used to reduce the size of the initial match
return self._parse_json( return self._parse_json(

View file

@ -597,7 +597,10 @@ from .godtube import GodTubeIE
from .gofile import GofileIE from .gofile import GofileIE
from .golem import GolemIE from .golem import GolemIE
from .goodgame import GoodGameIE from .goodgame import GoodGameIE
from .googledrive import GoogleDriveIE from .googledrive import (
GoogleDriveIE,
GoogleDriveFolderIE,
)
from .googlepodcasts import ( from .googlepodcasts import (
GooglePodcastsIE, GooglePodcastsIE,
GooglePodcastsFeedIE, GooglePodcastsFeedIE,

View file

@ -276,3 +276,59 @@ class GoogleDriveIE(InfoExtractor):
'automatic_captions': self.extract_automatic_captions( 'automatic_captions': self.extract_automatic_captions(
video_id, subtitles_id, hl), video_id, subtitles_id, hl),
} }
class GoogleDriveFolderIE(InfoExtractor):
IE_NAME = 'GoogleDrive:Folder'
_VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
_TESTS = [{
'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
'info_dict': {
'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
'title': 'Forrest'
},
'playlist_count': 3,
}]
_BOUNDARY = '=====vc17a3rwnndj====='
_REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
_DATA = f'''--{_BOUNDARY}
content-type: application/http
content-transfer-encoding: binary
GET %s
--{_BOUNDARY}
'''
def _call_api(self, folder_id, key, data, **kwargs):
response = self._download_webpage(
'https://clients6.google.com/batch/drive/v2beta',
folder_id, data=data.encode('utf-8'),
headers={
'Content-Type': 'text/plain;charset=UTF-8;',
'Origin': 'https://drive.google.com',
}, query={
'$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"',
'key': key
}, **kwargs)
return self._search_json('', response, 'api response', folder_id, **kwargs) or {}
def _get_folder_items(self, folder_id, key):
page_token = ''
while page_token is not None:
request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key)
page = self._call_api(folder_id, key, self._DATA % request)
yield from page['items']
page_token = page.get('nextPageToken')
def _real_extract(self, url):
folder_id = self._match_id(url)
webpage = self._download_webpage(url, folder_id)
key = self._search_regex(r'"(\w{39})"', webpage, 'key')
folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False)
return self.playlist_from_matches(
self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}')