diff --git a/README.md b/README.md index e3545c2..d5410e5 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ I have made this tool to retrieve all the tweets from my private conversations a [2016-09-07 10:38:10] You guys are ridiculous! 😂 ``` -This tool is also able to **download all the uploaded images** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space). +This tool is also able to **download all the uploaded images and videos** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space). You may have found suggestions to use the Twitter's archive feature to do the same but Direct Messages are not included in the generated archive. @@ -76,31 +76,33 @@ $ pip3 install dmarchiver --upgrade ### Command line tool ``` -$ dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg] +$ dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv] $ dmarchiver --help - usage: cmdline.py [-h] [-id CONVERSATION_ID] [-di] [-dg] + usage: cmdline.py [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv] optional arguments: -h, --help show this help message and exit -id CONVERSATION_ID, --conversation_id CONVERSATION_ID Conversation ID - -u, --username Username (e-mail or handle) - -p, --password Password + -u, --username Username (e-mail or handle) + -p, --password Password -di, --download-images Download images -dg, --download-gifs Download GIFs (as MP4) - -r, --raw-output Write the raw HTML to a file + -dv, --download-videos + Download videos (as MP4) + -r, --raw-output Write the raw HTML to a file ``` ### Examples -#### Archive all conversations with images: -`$ dmarchiver -di` +#### Archive all conversations with images and videos: +`$ dmarchiver -di -dv` The script output will be the `645754097571131337.txt` file with the 
conversation formatted in an _IRC-like_ style. -The images and GIFs files can be respectively found in the `645754097571131337/images` and `645754097571131337/mp4` folders. +The image and video files can be respectively found in the `645754097571131337/images` and `645754097571131337/mp4-*` folders. #### Archive a specific conversation: To retrieve only one conversation with the ID `645754097571131337`: @@ -133,7 +135,7 @@ for (var i = 0; i < conversations.length; i++) { #### Schedule a task to perform incremental backups of a conversation You can also specify the username and the password in the options. Because DMArchiver is able to perform incremental updates, you can schedule a task or create a shortcut with the following arguments: -`$ dmarchiver -id "conversation_id" -di -dg -u your_username -p your_password` +`$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password` ### Module import ```python diff --git a/dmarchiver/__init__.py b/dmarchiver/__init__.py index f73e902..db4428b 100644 --- a/dmarchiver/__init__.py +++ b/dmarchiver/__init__.py @@ -5,4 +5,4 @@ without having to deal with the API limitations. 
""" -__version__ = "0.1.6" +__version__ = "0.1.7" diff --git a/dmarchiver/cmdline.py b/dmarchiver/cmdline.py index cafa3cc..ad9a25b 100644 --- a/dmarchiver/cmdline.py +++ b/dmarchiver/cmdline.py @@ -4,7 +4,7 @@ Direct Messages Archiver - Command Line Usage: - # dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg] + # dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv] optional arguments: -h, --help show this help message and exit @@ -15,6 +15,8 @@ -di, --download-images Download images -dg, --download-gifs Download GIFs (as MP4) + -dv, --download-videos + Download videos (as MP4) -r, --raw-output Write the raw HTML to a file """ @@ -44,6 +46,11 @@ def main(): "--download-gifs", help="Download GIFs (as MP4)", action="store_true") + parser.add_argument( + "-dv", + "--download-videos", + help="Download videos (as MP4)", + action="store_true") parser.add_argument( "-r", "--raw-output", @@ -70,26 +77,31 @@ def main(): print('Error: {0}'.format(err.args[0])) print('Exiting.') sys.exit() + + print('Press Ctrl+C at anytime to write the current conversation and skip to the next one.\n Keep it pressed to exit the script.\n') - if args.conversation_id is not None: - # Prevent error when using '' instead of "" - conversation_id = args.conversation_id.strip('\'') - print( - 'Conversation ID specified ({0}). Retrieving only one thread.'.format( - args.conversation_id)) - crawler.crawl( - conversation_id, - args.download_images, - args.download_gifs, args.raw_output) - else: - print('Conversation ID not specified. Retrieving all the threads.') - threads = crawler.get_threads() - print('{0} thread(s) found.'.format(len(threads))) - - for thread_id in threads: - crawler.crawl(thread_id, args.download_images, - args.download_gifs, args.raw_output) + try: + if args.conversation_id is not None: + # Prevent error when using '' instead of "" + conversation_id = args.conversation_id.strip('\'') + print( + 'Conversation ID specified ({0}). 
Retrieving only one thread.'.format( + args.conversation_id)) + crawler.crawl( + conversation_id, + args.download_images, + args.download_gifs, args.download_videos, args.raw_output) + else: + print('Conversation ID not specified. Retrieving all the threads.') + threads = crawler.get_threads() + print('{0} thread(s) found.'.format(len(threads))) + for thread_id in threads: + crawler.crawl(thread_id, args.download_images, + args.download_gifs, args.download_videos, args.raw_output) + except KeyboardInterrupt: + print('Script execution interruption requested. Exiting.') + sys.exit() if __name__ == "__main__": main() diff --git a/dmarchiver/core.py b/dmarchiver/core.py index 094e8ea..0a1f73c 100644 --- a/dmarchiver/core.py +++ b/dmarchiver/core.py @@ -389,7 +389,8 @@ def _parse_dm_media( tweet_id, time_stamp, download_images, - download_gif): + download_gifs, + download_videos): media_url = '' media_preview_url = '' media_alt = '' @@ -438,12 +439,12 @@ def _parse_dm_media( media_filename = '{0}-{1}-{2}'.format(formatted_timestamp, media_filename_re[0][ 0], media_filename_re[0][1]) - if download_gif: + if download_gifs: response = self._session.get(media_url, stream=True) if response.status_code == 200: os.makedirs( - '{0}/mp4'.format(self._conversation_id), exist_ok=True) - with open('{0}/mp4/{1}'.format(self._conversation_id, media_filename), 'wb') as file: + '{0}/mp4-gifs'.format(self._conversation_id), exist_ok=True) + with open('{0}/mp4-gifs/{1}'.format(self._conversation_id, media_filename), 'wb') as file: response.raw.decode_content = True shutil.copyfileobj(response.raw, file) elif len(video_url) > 0: @@ -451,6 +452,19 @@ def _parse_dm_media( media_style = video_url[0].find('div').get('style') media_preview_url = re.findall('url\(\'(.*?)\'\)', media_style)[0] media_url = 'https://twitter.com/i/videos/dm/' + tweet_id + video_url = 'https://mobile.twitter.com/messages/media/' + tweet_id + media_filename = '{0}-{1}.mp4'.format( + formatted_timestamp, tweet_id) + + if download_videos: + 
response = self._session.get(video_url, stream=True) + if response.status_code == 200: + os.makedirs( + '{0}/mp4-videos'.format(self._conversation_id), exist_ok=True) + with open('{0}/mp4-videos/{1}'.format(self._conversation_id, media_filename), 'wb') as file: + response.raw.decode_content = True + shutil.copyfileobj(response.raw, file) + else: print('Unknown media') @@ -471,7 +485,7 @@ def _parse_dm_card(self, element): card.get('data-card-url'), card.get('data-card-name')) - def _process_tweets(self, tweets, download_images, download_gif, max_id): + def _process_tweets(self, tweets, download_images, download_gifs, download_videos, max_id): conversation_set = collections.OrderedDict() ordered_tweets = sorted(tweets, reverse=True) @@ -534,7 +548,7 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id): message.elements.append(element_object) elif 'DirectMessage-media' in dm_element_type: element_object = self._parse_dm_media( - dm_element, tweet_id, time_stamp, download_images, download_gif) + dm_element, tweet_id, time_stamp, download_images, download_gifs, download_videos) message.elements.append(element_object) elif 'DirectMessage-tweet' in dm_element_type: element_object = self._parse_dm_tweet(dm_element) @@ -548,6 +562,11 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id): elif len(dm_conversation_entry) > 0: dm_element_text = dm_conversation_entry[0].text.strip() message = DMConversationEntry(tweet_id, dm_element_text) + except KeyboardInterrupt: + print( + 'Script execution interruption requested. 
Writing the conversation.') + self._max_id_found = True + break except: print( 'Unexpected error for tweet \'{0}\', raw HTML will be used for the tweet.'.format(tweet_id)) @@ -564,7 +583,8 @@ def crawl( self, conversation_id, download_images=False, - download_gif=False, + download_gifs=False, + download_videos=False, raw_output=False): raw_output_file = None @@ -585,38 +605,42 @@ def crawl( payload = {'id': conversation_id} processed_tweet_counter = 0 - while True and self._max_id_found == False: - response = self._session.get( - conversation_url, - headers=self._ajax_headers, - params=payload) - - json = response.json() - - if 'max_entry_id' not in json: - print('Begin of thread reached') - break - - payload = {'id': conversation_id, - 'max_entry_id': json['min_entry_id']} - - tweets = json['items'] - - if raw_output: - ordered_tweets = sorted(tweets, reverse=True) - for tweet_id in ordered_tweets: - raw_output_file.write(tweets[tweet_id].encode('UTF-8')) - - # Get tweets for the current request - conversation_set = self._process_tweets( - tweets, download_images, download_gif, max_id) - - # Append to the whole conversation - for tweet_id in conversation_set: - processed_tweet_counter += 1 - conversation.tweets[tweet_id] = conversation_set[tweet_id] - print('Processed tweets: {0}\r'.format( - processed_tweet_counter), end='') + try: + while True and self._max_id_found == False: + response = self._session.get( + conversation_url, + headers=self._ajax_headers, + params=payload) + + json = response.json() + + if 'max_entry_id' not in json: + print('Begin of thread reached') + break + + payload = {'id': conversation_id, + 'max_entry_id': json['min_entry_id']} + + tweets = json['items'] + + if raw_output: + ordered_tweets = sorted(tweets, reverse=True) + for tweet_id in ordered_tweets: + raw_output_file.write(tweets[tweet_id].encode('UTF-8')) + + # Get tweets for the current request + conversation_set = self._process_tweets( + tweets, download_images, download_gifs, 
download_videos, max_id) + + # Append to the whole conversation + for tweet_id in conversation_set: + processed_tweet_counter += 1 + conversation.tweets[tweet_id] = conversation_set[tweet_id] + print('Processed tweets: {0}\r'.format( + processed_tweet_counter), end='') + except KeyboardInterrupt: + print( + 'Script execution interruption requested. Writing this conversation.') if raw_output: raw_output_file.close() diff --git a/setup.py b/setup.py index 555f599..48289d7 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,27 @@ -from setuptools import setup, find_packages - +import codecs +from setuptools import setup, find_packages import dmarchiver setup( - + name='dmarchiver', version=dmarchiver.__version__, - + packages=find_packages(), - install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'], + install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'], author="Julien EHRHART", author_email="julien.ehrhart@live.com", - + description="A tool to archive the direct messages from your private conversations on Twitter.", - - long_description=open('README.md').read(), - + + long_description=codecs.open('README.md', 'r', 'utf-8').read(), + include_package_data=True, - + url='https://github.com/Mincka/DMArchiver', - + classifiers=[ "Programming Language :: Python", "Development Status :: 3 - Alpha", @@ -32,12 +32,12 @@ "Topic :: System :: Archiving", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", ], - - entry_points = { + + entry_points={ 'console_scripts': [ 'dmarchiver = dmarchiver.cmdline:main', ], }, - + license="GNU General Public License v3 (GPLv3)", )