# See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
#
# All rules for every crawler live in ONE "User-agent: *" group, and the group
# contains no blank lines: parsers following the original robots.txt convention
# end a record at a blank line, and some parsers honor only the first group
# that matches a user-agent. Use "#" lines as visual separators instead.
#
User-agent: *
Disallow: /download_tokens/
Disallow: /orders/
Disallow: /carts/
Disallow: /cart/
Disallow: /my/
Disallow: /me/
# duplicate content
Disallow: /payloads/
Disallow: /package_formats/
Disallow: /notes/
Disallow: /score_metrics/
# old site urls
Disallow: /dataset/
Disallow: /contributor/
Disallow: /contributors/
Disallow: /fields/
Disallow: /field/
Disallow: /dl/
# non-content
Disallow: /login
Disallow: /logout
Disallow: /signup
Disallow: /stylesheets/
Disallow: /javascripts/
Disallow: /flash/
Disallow: /opensearch.xml
Disallow: /404.html
Disallow: /500.html
Allow: /
#
# Crawl pacing -- these directives are part of the Extended standard of robots.txt
#   Crawl-delay:  wait at least 1 second between requests
#   Request-rate: maximum rate is one page every 2 seconds
#   Visit-time:   only crawl between 01:00 and 11:30 UTC
#                 (that's 9:00 PM to 7:30 AM US Eastern Daylight Time, UTC-4)
# Request-rate and Visit-time are not understood by Google's bot; that doesn't
# seem to produce adverse effects though.
Crawl-delay: 1
Request-rate: 1/2
Visit-time: 0100-1130

#
# If you're interested in getting a large quantity of data please contact
# us and we'll get it to you. Don't hammer the servers!
#
# If you're using wget,
#   wget --wait=1 --limit-rate=50k --tries 2
# will help lower your impact and will respect the above crawl directives
#
# Thanks.
#