# See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
#
# Rules for every crawler ("*" matches any user agent).
# Keep this group free of blank lines: older parsers treat a blank line as
# the end of the record.
User-agent: *
# NOTE(review): presumably per-user / transactional pages (download tokens,
# orders, carts, account pages) -- confirm against the app's routes.
Disallow: /download_tokens/
Disallow: /orders/
Disallow: /carts/
Disallow: /cart/
Disallow: /my/
Disallow: /me/
# duplicate content (keep crawlers off paths that repeat content found elsewhere)
Disallow: /payloads/
Disallow: /package_formats/
Disallow: /notes/
Disallow: /score_metrics/
# old site urls (legacy paths that should not be crawled)
Disallow: /dataset/
Disallow: /contributor/
Disallow: /contributors/
Disallow: /fields/
Disallow: /field/
Disallow: /dl/
# non-content (auth endpoints, static assets, error pages)
# Note: /login, /logout and /signup have no trailing slash, so they match any
# path that begins with that prefix.
Disallow: /login
Disallow: /logout
Disallow: /signup
Disallow: /stylesheets/
Disallow: /javascripts/
Disallow: /flash/
Disallow: /opensearch.xml
Disallow: /404.html
Disallow: /500.html
# Everything not disallowed above may be crawled. This line must stay last:
# parsers that apply the first matching rule would otherwise let "Allow: /"
# override the Disallow lines.
Allow: /
# wait at least 1 second between requests (Crawl-delay)
# maximum rate is one page every 2 seconds
# only crawl between 9:00 PM and 7:30 AM US Eastern Daylight Time (EDT, UTC-4) -- that's 01:00 to 11:30 UTC
# Request-rate and Visit-time are not understood by Google's bot, but they don't seem to cause any adverse effects.
# Both are part of the Extended Standard for robots.txt.
# These pacing directives deliberately continue the "User-agent: *" group
# above rather than opening a second "User-agent: *" group. Parsers that honor
# only the first matching group (for example Python's urllib.robotparser)
# would silently ignore a repeated group and never see these limits.
# Do not insert a blank line between the rules above and these lines.
#   Crawl-delay:  minimum number of seconds between successive requests
#   Request-rate: pages/seconds -- at most 1 page every 2 seconds
#   Visit-time:   allowed crawl window, HHMM-HHMM in UTC
Crawl-delay: 1
Request-rate: 1/2
Visit-time: 0100-1130
#
# If you're interested in getting a large quantity of data please contact
# us and we'll get it to you. Don't hammer the servers!
#
# If you're using wget,
# wget --wait=1 --limit-rate=50k --tries 2
# will help lower your impact and will respect the above crawl directives
#
# Thanks.
#