From c9052e0ba7ac20def47e396e6f5757908825d8ef Mon Sep 17 00:00:00 2001 From: Mincka Date: Sun, 21 Oct 2018 20:53:41 +0200 Subject: [PATCH] Fixes #43 --- README.md | 25 ++++++++++++++++--------- dmarchiver/__init__.py | 2 +- dmarchiver/cmdline.py | 11 +++++++++-- dmarchiver/core.py | 18 +++++++++++------- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 433c455..04d501f 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,6 @@ # DMArchiver A tool to archive **all** the direct messages from your private conversations on Twitter. -## Warning: possible account lockout -Users are starting to report account lockouts because of the use of this tool. Twitter seems to lock accounts more aggressively if a new login context is detected. Even though locking can be reverted, you should be aware of this risk when using this tool. An additional attempt after unlocking can allow the tool to perform better on the second run. - ## Introduction Have you ever need to retrieve old information from a chat with your friends on Twitter? Or maybe you would just like to backup all these cheerful moments and keep them safe. @@ -27,12 +24,18 @@ The script does not leverage the Twitter API because of its very restrictive lim Because it is still possible to retrieve older messages from a Conversation by scrolling up, this script only simulates this behavior to automatically get the messages. -**Warning:** -Because this script leverages an unsupported method to retrieve the tweets, it may break at any time. Indeed, Twitter may change the output code without warning. If you get errors you did not have previously, please check if new releases of the tool are available. +**Warning: possible account lockout** + +A few users have reported account lockouts because of the use of this tool. Twitter seems to lock accounts more aggressively if a new login context is detected. Even though locking can be reverted, you should be aware of this risk when using this tool. An additional attempt after unlocking can allow the tool to perform better on the second run. + +If you need to run the tool multiple times, it is also recommended to use the `-s` parameter to reuse cookies from a previous session. You will not receive a new login warning by e-mail since the tool will reuse an existing session. **Disclaimer:** + Using this tool will only behave like you using the Twitter web site with your browser, so there is nothing illegal to use it to retrieve your own data. However, depending on your conversations' length, it may trigger a lot of requests to the site that could be suspicious for Twitter. In this case, Twitter could lock preemptively the account. +Because this script leverages an unsupported method to retrieve the tweets, it may break at any time. Indeed, Twitter may change the output code without warning. If you get errors you did not have previously, please check if new releases of the tool are available. + ## Installation & Quick start By running the tool without any argument, you will be only prompted for your username and your password. The script will retrieve all the messages, from all the conversations without the images or the GIFs. @@ -97,6 +100,8 @@ $ dmarchiver --help -dg, --download-gifs Download GIFs (as MP4) -dg, --download-videos Download videos (as MP4) + -th, --twitter-handle + Use the Twitter handles instead of the display names -r, --raw-output Write the raw HTML to a file ``` @@ -111,14 +116,14 @@ The script output will be the `645754097571131337.txt` file with the conversatio The images and videos files can be respectively found in the `645754097571131337/images` and `645754097571131337/mp4-*` folders. -#### Archive a specific conversation: +#### Archive a specific conversation, and use the Twitter handles for the usernames: To retrieve only one conversation with the ID `645754097571131337`: ``` -$ dmarchiver -id "645754097571131337" +$ dmarchiver -id "645754097571131337" -th ``` -The script output will be the `645754097571131337.txt` file with the conversation formatted in an _IRC-like_ style. +The script output will be the `645754097571131337.txt` file with the conversation formatted in an _IRC-like_ style, using the Twitter handles instead of the display names. #### How to get a `conversation_id`? @@ -145,9 +150,11 @@ for (var i = 0; i < conversations.length; i++) { You can also specify the username and the password in the options. Because DMArchiver is able to perform incremental updates, you can schedule a task or create a shortcut with the following arguments: ``` -$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password +$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password -s ``` +Note the usage of the `-s` flag to use an existing session, instead of creating a new one. + ## Development ### Ubuntu / Windows diff --git a/dmarchiver/__init__.py b/dmarchiver/__init__.py index f3e7ae3..c6f931f 100644 --- a/dmarchiver/__init__.py +++ b/dmarchiver/__init__.py @@ -5,4 +5,4 @@ without having to deal with the API limitations. """ -__version__ = "0.2.4" +__version__ = "0.2.5" diff --git a/dmarchiver/cmdline.py b/dmarchiver/cmdline.py index 45a41d6..e6ecb49 100644 --- a/dmarchiver/cmdline.py +++ b/dmarchiver/cmdline.py @@ -19,6 +19,8 @@ -dg, --download-gifs Download GIFs (as MP4) -dv, --download-videos Download videos (as MP4) + -th, --twitter-handle + Use the Twitter handles instead of the display names -r, --raw-output Write the raw HTML to a file """ @@ -63,6 +65,11 @@ def main(): "--download-videos", help="Download videos (as MP4)", action="store_true") + parser.add_argument( + "-th", + "--twitter-handle", + help="Use the Twitter handles instead of the display names", + action="store_true") parser.add_argument( "-r", "--raw-output", @@ -106,7 +113,7 @@ def main(): conversation_id, args.delay, args.download_images, - args.download_gifs, args.download_videos, args.raw_output) + args.download_gifs, args.download_videos, args.twitter_handle, args.raw_output) else: print('Conversation ID not specified. Retrieving all the threads.') threads = crawler.get_threads(args.delay, args.raw_output) @@ -114,7 +121,7 @@ def main(): for thread_id in threads: crawler.crawl(thread_id, args.delay, args.download_images, - args.download_gifs, args.download_videos, args.raw_output) + args.download_gifs, args.download_videos, args.twitter_handle, args.raw_output) time.sleep(args.delay) except KeyboardInterrupt: print('Script execution interruption requested. Exiting.') diff --git a/dmarchiver/core.py b/dmarchiver/core.py index 4918dda..4b6b08f 100644 --- a/dmarchiver/core.py +++ b/dmarchiver/core.py @@ -582,7 +582,7 @@ def _parse_dm_card(self, element): card.get('data-card-url'), card.get('data-card-name')) - def _process_tweets(self, tweets, download_images, download_gifs, download_videos, max_id): + def _process_tweets(self, tweets, download_images, download_gifs, download_videos, twitter_handle, max_id): conversation_set = collections.OrderedDict() ordered_tweets = sorted(tweets, reverse=True) @@ -617,11 +617,14 @@ def _process_tweets(self, tweets, download_images, download_gifs, download_video 'div.DMConversationEntry') if len(dm_container) > 0: - dm_avatar = dm_container[0].cssselect( - 'img.DMAvatar-image')[0] - dm_author = dm_avatar.get('alt') - - # print(dm_author) + if twitter_handle: + dm_avatar = dm_container[0].cssselect( + 'div.DirectMessage-avatar a')[0] + dm_author = dm_avatar.get('href')[1:] + else: + dm_avatar = dm_container[0].cssselect( + 'img.DMAvatar-image')[0] + dm_author = dm_avatar.get('alt') dm_footer = document.cssselect('div.DirectMessage-footer') time_stamp = dm_footer[0].cssselect('span._timestamp')[ @@ -684,6 +687,7 @@ def crawl( download_images=False, download_gifs=False, download_videos=False, + twitter_handle=False, raw_output=False): raw_output_file = None @@ -740,7 +744,7 @@ def crawl( # Get tweets for the current request conversation_set = self._process_tweets( - tweets, download_images, download_gifs, download_videos, max_id) + tweets, download_images, download_gifs, download_videos, twitter_handle, max_id) # Append to the whole conversation for tweet_id in conversation_set: